#!/usr/bin/perl -w

# Consistency checker for the DRCAT.dat data resource catalogue.
#
# Reads three files from a data directory:
#   EDAM.obo    - ontology terms (id, alt_id, name, namespace, is_obsolete)
#   dbxref.txt  - database cross-reference list
#   DRCAT.dat   - the catalogue being checked
# Diagnostics go to STDERR.  A filtered copy of DRCAT.dat (placeholder
# "None"/"Unknown" lines and all-"Unknown" Query/Example lines removed) is
# written to DRCAT.new in the current directory.
#
# Usage: script [DATADIR]
#   DATADIR defaults to the original hard-coded location.
#
# NOTE(review): this source was recovered from a whitespace-collapsed copy.
# Spacing inside regular expressions and messages is reproduced exactly as
# seen there; confirm it against the real data files.

use strict;
use warnings;

my $datadir = @ARGV ? $ARGV[0] : "/homes/pmr/devemboss/emboss/data";

# Xref types accepted on DRCAT "Xref" lines.
my %xrefs = (
    "SP_explicit"   => "swissprot",
    "SP_implicit"   => "swissprot",
    "SP_CC"         => "swissprot",
    "SP_FT"         => "swissprot",
    "SP_lit"        => "swissprot",
    "EMBL_DR"       => "embl",
    "EMBL_explicit" => "embl",
    "Other"         => "Tax_id, EC_number, etc.",
);

# Query formats accepted even without an EDAMfmt line (only the keys matter,
# and they are tested with defined(), so the values are never read).
my %knownfmt = (
    "Text" => 0,
    "HTML" => 0,
    "XML"  => 0,
);

my @required  = ("ID", "Name", "Desc");
my @preferred = ("URL", "Query", "Example", "Email", "Status");
my %required  = map { $_ => 1 } @required;
my %preferred = map { $_ => 1 } @preferred;

# Tables filled from EDAM.obo, keyed by numeric term id.
my %edam;         # id     => name
my %edamspace;    # id     => namespace
my %isobs;        # id     => 1 when is_obsolete is "true"
my %trueid;       # alt_id => id of the term that declared it

# Tables filled from dbxref.txt.
my %xrac;         # AC     => abbreviation (the AC itself until Abbrev is seen)
my %xrid;         # abbreviation => AC
my %xlink;
my %xserver;
my %xurl;
my %xcat;
my %xname;
my %xcit;
my %xnote;
my %xcont;        # last continuation line seen for each abbreviation

# State of the DRCAT entry currently being read; reset on every ID line.
my $did   = "";   # current entry ID
my $iline = 0;    # line number of the current ID line (0 = none seen yet)
my $contact;
my $email;
my %edamid;       # EDAMid  name => term
my %edamdat;      # EDAMdat name => term
my %edamfmt;      # EDAMfmt name => term
my %edamtpc;      # EDAMtpc name => term
my %qdat;         # data names used on Query lines
my %qfmt;         # format names used on Query lines
my %qid;          # identifier names used on Query lines
my %oid;          # Taxon lines seen
my %rid;          # Xref types seen
my %xid;          # identifier names used on Example lines

# Totals over the whole catalogue.
my %catid;        # entry ID  => accession
my %catac;        # accession => entry ID
my %badqid;       # Query ids with no EDAMid/EDAMdat line => count
my %badxid;       # Example ids with no EDAMid/EDAMdat line => count
my %noexample;    # Query ids with no Example line => count

# EDAM* line prefix => [ required namespace, per-entry table to fill ].
my %edam_kind = (
    "EDAMtpc" => [ "topic",      \%edamtpc ],
    "EDAMfmt" => [ "format",     \%edamfmt ],
    "EDAMdat" => [ "data",       \%edamdat ],
    "EDAMid"  => [ "identifier", \%edamid  ],
);

# Report the cross-line problems of the entry just completed: query ids
# without an example, EDAM lines never used by a query/example, and missing
# Query/Example/Taxon lines.  Reads the per-entry tables; updates %noexample.
sub checkid {
    my $qcnt = 0;
    foreach my $qid (sort keys %qid) {
        $qcnt++;
        if (!defined $xid{$qid}) {
            print STDERR "$iline: QRYID-NOEXAMPLE ($qid{$qid}): '$qid'\n";
            $noexample{$qid} += $qid{$qid};
        }
    }
    foreach my $e (sort keys %edamdat) {
        if (!defined $qdat{$e}) {
            print STDERR "$iline: EDAMDAT-UNUSED: '$e'\n";
        }
    }
    foreach my $e (sort keys %edamid) {
        if (!defined $qid{$e} && !defined $xid{$e}) {
            print STDERR "$iline: EDAMID-UNUSED: '$e'\n";
        }
    }
    foreach my $e (sort keys %edamfmt) {
        if (!defined $qfmt{$e}) {
            print STDERR "$iline: EDAMFMT-UNUSED: '$e'\n";
        }
    }

    # missing lines
    if (!keys %qid) {
        print STDERR "$iline: NOLINE-QUERY no query for: $did\n";
    }
    if (!keys %xid) {
        print STDERR "$iline: NOLINE-EXAMPLE no example for: $did\n";
    }
    if (!keys %oid) {
        print STDERR "$iline: NOLINE-TAXON no taxon for: $did\n";
    }
    if (keys %rid && !$qcnt) {
        print STDERR "$iline: XREF-NO-QUERY xref but no query for: $did\n";
    }
    return;
}

# Sort comparator: ascending count in %badqid, ties broken by name.
sub numqid {
    return $badqid{$a} <=> $badqid{$b} || $a cmp $b;
}

# Sort comparator: ascending count in %noexample, ties broken by name.
sub numxqid {
    return $noexample{$a} <=> $noexample{$b} || $a cmp $b;
}

# Load the [Term] stanzas of an OBO file into %edam, %edamspace, %isobs and
# %trueid.  Lines of any other stanza type are ignored.
sub _load_edam {
    my ($path) = @_;
    open(my $fh, '<', $path) or die "Cannot open $path: $!";

    my $isterm = 0;
    my $id;
    while (my $row = <$fh>) {
        if ($row =~ /^[\[]([^\]]+)/) {
            $isterm = ($1 eq "Term") ? 1 : 0;
        }
        next if !$isterm;

        if ($row =~ /^id: EDAM:(\d+)/) {
            $id = $1;
        }
        elsif ($row =~ /^alt_id: EDAM:(\d+)/) {
            $trueid{$1} = $id;
        }
        elsif ($row =~ /^name: ([^\n\!]+)/) {
            my $name = $1;
            $name =~ s/\s+$//g;
            $edam{$id} = $name;
        }
        elsif ($row =~ /^namespace: ([^\n\!]+)/) {
            my $namespace = $1;
            $namespace =~ s/\s+$//g;
            $edamspace{$id} = $namespace;
        }
        elsif ($row =~ /^is_obsolete: (\S+)/) {
            my $obs = $1;
            if ($obs eq "true") {
                $isobs{$id} = 1;
            }
            else {
                print STDERR "obsolete '$obs' for id '$id'\n";
            }
        }
    }
    close($fh);
    return;
}

# Load dbxref.txt.  Records are read between a line of underscores (start)
# and a line of dashes (end); unrecognised lines are reported.
#
# dbxref.txt from ftp://ftp.ebi.ac.uk/
# pub/databases/uniprot/current_release/knowledgebase/complete/docs/dbxref.txt
#
# NOTE(review): the field labels below ("AC : ", "Cat : ", ...) are as found
# in the recovered source; the real file may pad them to a fixed column.
sub _load_dbxref {
    my ($path) = @_;
    open(my $fh, '<', $path) or die "Cannot open $path: $!";

    my $isdata = 0;
    my ($xrac, $xrid);
    while (my $row = <$fh>) {
        if ($isdata) {
            if ($row =~ /^[-]+$/) { $isdata = 0; next }
            if ($row =~ /^$/)     { next }
            elsif ($row =~ /^AC : (\S+)/) {
                $xrac = $1;
                $xrac{$xrac} = $xrac;
            }
            elsif ($row =~ /^Abbrev: (.*)$/) {
                $xrid = $1;
                $xrid{$xrid} = $xrac;
                $xrac{$xrac} = $xrid;
            }
            elsif ($row =~ /^LinkTp: (.*)$/) {
                $xlink{$xrid} = $1;
            }
            elsif ($row =~ /^Server: (.*)$/) {
                $xserver{$xrid} = $1;
            }
            elsif ($row =~ /^Db_URL: (.*)$/) {
                $xurl{$xrid} = $1;
            }
            elsif ($row =~ /^Cat : (.*)$/) {
                # Was stored in %xurl, overwriting the Db_URL value.
                $xcat{$xrid} = $1;
            }
            elsif ($row =~ /^Name : (.*)$/) {
                $xname{$xrid} = $1;
            }
            elsif ($row =~ /^Ref : (.*)$/) {
                $xcit{$xrid} = $1;
            }
            elsif ($row =~ /^Note : (.*)$/) {
                $xnote{$xrid} = $1;
            }
            elsif ($row =~ /^ (\S.*)$/) {
                $xcont{$xrid} = $1;
            }
            else {
                print STDERR "DBXREF-BADLINE: $row";
            }
        }
        if ($row =~ /^[_]+$/) { $isdata = 1 }
    }
    close($fh);
    return;
}

# Report a field whose value is the placeholder "None" or "Unknown".
# "Taxon None" is legitimate and is counted as a taxon line instead.
sub _check_placeholder {
    my ($line, $pref, $rest, $entry) = @_;
    if ($required{$pref}) {
        print STDERR "$line: REQUIRED: $entry";
    }
    elsif ($preferred{$pref}) {
        print STDERR "$line: PREFERRED: $entry";
    }
    elsif ($rest eq "None" && $pref eq "Taxon") {
        $oid{$did}++;
    }
    else {
        print STDERR "$line: EMPTY: $entry";
    }
    return;
}

# Check an "Xref" line: "<type> | <ids>".
sub _check_xref {
    my ($line, $rest, $entry) = @_;
    my ($type, $ids) = ($rest =~ /^(\S+) [\|] (.*)/);

    # An unparsable line is reported as an unknown (empty) type, as before,
    # but without indexing the tables with undef.
    $type = "" if !defined $type;

    if (!defined $xrefs{$type}) {
        print STDERR "$line: XREF-TYPE $type: $entry";
    }
    elsif ($type eq "SP_explicit") {
        if ($ids =~ /[Nn]one/) {
            print STDERR "$line: XREF-NONE ($did)\n";
        }
    }
    $rid{$type}++;
    return;
}

# Check a "Query" line: "<data> | <format> | <ids> | <url>" with ids
# separated by ';'.  Returns 0 when every field is "Unknown" (the line is
# then dropped from the output), 1 otherwise.
sub _check_query {
    my ($line, $rest, $entry) = @_;
    my $keep = 1;

    $rest =~ s/ [\{][^\}]*[\}]//g;    # drop " {...}" annotations
    # NOTE(review): the class [\| ] also accepts a space, so three
    # consecutive spaces split a field too - confirm this is intended.
    my ($data, $fmt, $ids, $url, $xtra) = split(/ [\| ] /, $rest);
    my $malformed = (!defined($url) || defined($xtra));

    # Missing fields behave as empty strings in the checks below.
    foreach my $field ($data, $fmt, $ids, $url) {
        $field = "" if !defined $field;
    }

    # Was testing $id (the last EDAM id read), so it could never be true.
    if (   $data eq "Unknown"
        && $ids  eq "Unknown"
        && $fmt  eq "Unknown"
        && $url  eq "Unknown")
    {
        $keep = 0;
        print STDERR "$line: skip\n";
    }
    if ($malformed) {
        print STDERR "$line: BAD-QUERY: $entry";
    }

    $qdat{$data}++;
    if (!keys %edamdat) {
        print STDERR "$line: QRY EDAMdat MISSING '$data': $entry";
    }
    elsif (!defined $edamdat{$data}) {
        print STDERR "$line: QRY DATA '$data': $entry";
    }

    $qfmt{$fmt}++;
    if (!defined $edamfmt{$fmt} && !defined $knownfmt{$fmt}) {
        print STDERR "$line: QRY FORMAT '$fmt': $entry";
    }

    foreach my $qid (split(/;/, $ids)) {
        if (!defined $edamid{$qid}) {
            if (defined $edamdat{$qid}) {
                print STDERR "$line: QRY ID=DAT '$qid': $entry";
            }
            elsif ($qid ne "None") {
                print STDERR "$line: QRY ID '$qid': $entry";
                $badqid{$qid}++;
            }
        }
        $qid{$qid}++;
    }
    return $keep;
}

# Check an "Example" line: "<ids> | <values>", both ';'-separated lists.
# Returns 0 when both fields are "Unknown", 1 otherwise.
sub _check_example {
    my ($line, $rest, $entry) = @_;
    my $keep = 1;

    $rest =~ s/ [\{][^\}]*[\}]//g;    # drop " {...}" annotations
    my ($ids, $values) = split(/ [\| ] /, $rest);
    my $malformed = !defined($values);
    $ids    = "" if !defined $ids;
    $values = "" if !defined $values;

    # Was a verbatim copy of the Query test, reading variables left over
    # from the previous Query line; test this line's own fields instead.
    if ($ids eq "Unknown" && $values eq "Unknown") {
        $keep = 0;
        print STDERR "$line: skip\n";
    }
    if ($malformed) {
        # Was labelled BAD-QUERY.
        print STDERR "$line: BAD-EXAMPLE: $entry";
    }

    my @ids = split(/;/, $ids);
    foreach my $xid (@ids) {
        if (!defined $qid{$xid}) {
            print STDERR "$line: EXAMPLE ID NO QRY '$xid': $entry";
        }
        elsif (!defined $edamid{$xid}) {
            if (defined $edamdat{$xid}) {
                print STDERR "$line: EXAMPLE ID=DAT '$xid': $entry";
            }
            else {
                print STDERR "$line: EXAMPLE ID '$xid': $entry";
                $badxid{$xid}++;    # was the literal key 'xqid'
            }
        }
        $xid{$xid}++;
    }

    my @values = split(/;/, $values);
    if ($#values != $#ids) {
        # The numbers printed are last indexes (count - 1), as before.
        print STDERR "$line: EXAMPLE $#ids ids $#values values: $entry";
    }
    return $keep;
}

# Check an "EDAM*" line: "<term number> | <term name>".
sub _check_edam {
    my ($line, $pref, $rest, $entry) = @_;
    my ($term, $name) = ($rest =~ /^(\d+) [\|] (.*)/);

    if (!defined $term) {
        print STDERR "$line: BAD LINE: $entry";
        return;    # nothing further can be looked up without a term id
    }

    if (defined $trueid{$term}) {
        my $newterm = $trueid{$term};
        print STDERR "$line: EDAM-ALTID $trueid{$term} '$edam{$newterm}' $entry";
    }
    elsif (exists $edam_kind{$pref}) {
        my ($wanted, $table) = @{ $edam_kind{$pref} };
        $table->{$name} = $term;
        if (!defined $edam{$term}) {
            print STDERR "$line: NOT-IN-EDAM: $entry";
        }
        elsif ($edamspace{$term} ne $wanted) {
            print STDERR "$line: EDAM-NAMESPACE $edamspace{$term}: $entry";
        }
    }
    elsif ($pref eq "EDAMres") {
        print STDERR "$line: EDAM-RESOURCE $edamspace{$term}: $entry";
    }

    if ($isobs{$term}) {
        print STDERR "$line: EDAM-OBSOLETE: $entry";
    }
    if (defined $edam{$term} && $edam{$term} ne $name) {
        print STDERR "$line: EDAM-NAME '$edam{$term}': $entry";
    }
    return;
}

# Check one "<prefix> <value>" line whose value is not a placeholder.
# The prefix tests are prefix matches and are tried in this order.
# Returns 1 when the line should be copied to the output, 0 otherwise.
sub _check_field {
    my ($line, $pref, $rest, $entry) = @_;
    my $keep = 1;

    if ($rest =~ /\s\s\s+$/) {
        print STDERR "$line: SPACES: $entry";
    }

    if ($pref =~ /^ID$/) {
        # A new entry starts: finish the previous one, then reset state.
        checkid() if $iline;
        $did     = $rest;
        %edamid  = ();
        %edamdat = ();
        %edamfmt = ();
        %edamtpc = ();
        %qdat    = ();
        %qfmt    = ();
        %qid     = ();
        %oid     = ();
        %rid     = ();
        %xid     = ();
        $contact = "unknown-contact";
        $email   = "unknown-email";
        $iline   = $line;
    }
    elsif ($pref =~ /^Taxon/) {
        $oid{$did}++;
    }
    elsif ($pref =~ /^Acc/) {
        $catid{$did}  = $rest;
        $catac{$rest} = $did;
    }
    elsif ($pref =~ /^Contact/) {
        $contact = $rest;
    }
    elsif ($pref =~ /^Email/) {
        $email = $rest;
    }
    elsif ($pref =~ /^Desc/) {
        if ($rest =~ /[\|]/) {
            print STDERR "$line: BAD-DESC: $entry";
        }
    }
    elsif ($pref =~ /^Xref/) {
        _check_xref($line, $rest, $entry);
    }
    elsif ($pref =~ /^Query/) {
        $keep = _check_query($line, $rest, $entry);
    }
    elsif ($pref =~ /^Example/) {
        $keep = _check_example($line, $rest, $entry);
    }
    elsif ($pref =~ /^EDAM/) {
        _check_edam($line, $pref, $rest, $entry);
    }
    return $keep;
}

# Read the catalogue, check every line and write the kept lines to $outpath.
# Comment lines, blank lines and lines that are not "<prefix> <value>" are
# copied unchanged.
sub _process_drcat {
    my ($inpath, $outpath) = @_;

    # Open the input first so a missing catalogue does not truncate the
    # output file; the output was previously opened twice.
    open(my $in,  '<', $inpath)  or die "Cannot open $inpath: $!";
    open(my $out, '>', $outpath) or die "Cannot open $outpath: $!";

    my $line = 0;
    while (my $entry = <$in>) {
        $line++;
        if ($entry =~ /^[\#]/ || $entry =~ /^\s*$/) {
            print {$out} $entry;
            next;
        }

        my $keep = 1;
        if ($entry =~ /^(\S+) +([^\n]+)/) {
            my ($pref, $rest) = ($1, $2);
            if ($rest eq "None" || $rest eq "Unknown") {
                _check_placeholder($line, $pref, $rest, $entry);
                $keep = 0;
            }
            else {
                $keep = _check_field($line, $pref, $rest, $entry);
            }
        }
        print {$out} $entry if $keep;
    }

    close($in);
    close($out) or die "Cannot close $outpath: $!";

    checkid();    # the last entry has no following ID line to trigger it
    return;
}

# Print the catalogue-wide totals and the comparison with dbxref.txt.
sub _report_summary {
    print STDERR "\nSummary:\n\n";
    print STDERR "\nUndefined query id:\n\n";
    foreach my $bq (sort numqid (keys %badqid)) {
        printf STDERR "%6d %s\n", $badqid{$bq}, $bq;
    }
    print STDERR "\nNo example for query id:\n\n";
    foreach my $qi (sort numxqid (keys %noexample)) {
        printf STDERR "%6d %s\n", $noexample{$qi}, $qi;
    }

    foreach my $x (sort keys %catac) {
        if (!defined $xrac{$x}) {
            print STDERR "DB-BADAC: $x $catac{$x}\n";
        }
    }
    foreach my $x (sort keys %xrac) {
        if (!defined $catac{$x}) {
            print STDERR "NEWDB-AC: $x $xrac{$x}\n";
        }
    }
    foreach my $x (sort keys %xrid) {
        if (!defined $catid{$x}) {
            print STDERR "NEWDB-ID: $xrid{$x} $x\n";
        }
    }
    return;
}

_load_edam("$datadir/EDAM.obo");
_load_dbxref("$datadir/dbxref.txt");
_process_drcat("$datadir/DRCAT.dat", "DRCAT.new");
_report_summary();