1#!/usr/bin/perl -w
2
3%xrefs = (
4    "SP_explicit" => "swissprot",
5    "SP_implicit" => "swissprot",
6    "SP_CC" => "swissprot",
7    "SP_FT" => "swissprot",
8    "SP_lit" => "swissprot",
9    "EMBL_DR" => "embl",
10    "EMBL_explicit" => "embl",
11    "Other" => "Tax_id, EC_number, etc.",
12    );
13
14%knownfmt = (
15    "Text" => 0,
16    "HTML" => 0,
17    "XML" => 0,
18    );
19
20@required = ("ID", "Name", "Desc");
21@preferred = ("URL", "Query", "Example", "Email", "Status");
22
23
24sub checkid() {
25    my $qid;
26    my $e;
27    my $qcnt = 0;
28    foreach $qid (sort(keys(%qid))){
29	$qcnt++;
30	if(!defined($xid{$qid})) {
31	    print STDERR "$iline: QRYID-NOEXAMPLE ($qid{$qid}): '$qid'\n";
32	    $noexample{$qid} += $qid{$qid};
33	}
34    }
35    foreach $e (sort(keys(%edamdat))){
36	if(!defined($qdat{$e})) {
37	    print STDERR "$iline: EDAMDAT-UNUSED: '$e'\n";
38	}
39    }
40    foreach $e (sort(keys(%edamid))){
41	if(!defined($qid{$e}) && !defined($xid{$e})) {
42	    print STDERR "$iline: EDAMID-UNUSED: '$e'\n";
43	}
44    }
45    foreach $e (sort(keys(%edamfmt))){
46	if(!defined($qfmt{$e})) {
47	    print STDERR "$iline: EDAMFMT-UNUSED: '$e'\n";
48	}
49    }
50    # missing lines
51    if(!keys(%qid)) {
52	print STDERR "$iline: NOLINE-QUERY no query for: $did\n";
53    }
54    if(!keys(%xid)) {
55	print STDERR "$iline: NOLINE-EXAMPLE no example for: $did\n";
56    }
57    if(!keys(%oid)) {
58	print STDERR "$iline: NOLINE-TAXON no taxon for: $did\n";
59    }
60    if(keys(%rid)) {
61	if(!$qcnt) {
62	    print STDERR "$iline: XREF-NO-QUERY xref but no query for: $did\n";
63	}
64    }
65}
66
67foreach $p (@preferred) {$preferred{$p}=1}
68foreach $r (@required)  {$required{$r}=1}
69
70open(EDAM, "/homes/pmr/devemboss/emboss/data/EDAM.obo") || die "Cannot open EDAM.obo";
71
72$isterm = 0;
73$nterms = 0;
74while (<EDAM>){
75    if(/^[\[]([^\]]+)/) {
76	if($1 eq "Term") {$isterm = 1}
77	else {$isterm = 0}
78    }
79    if(!$isterm) {next}
80
81    if(/^id: EDAM:(\d+)/) {
82	$id = $1;
83	$ntrms++;
84    }
85    elsif(/^alt_id: EDAM:(\d+)/) {
86	$altid = $1;
87	$trueid{$altid} = $id;
88    }
89    elsif(/^name: ([^\n\!]+)/) {
90	$name = $1;
91	$name =~ s/\s+$//g;
92	$edam{$id} = $name;
93    }
94    elsif(/^namespace: ([^\n\!]+)/) {
95	$namespace = $1;
96	$namespace =~ s/\s+$//g;
97	$edamspace{$id} = $namespace;
98    }
99    elsif(/^is_obsolete: (\S+)/) {
100	$obs = $1;
101	if($obs eq "true") {$isobs{$id}=1}
102	else {print STDERR "obsolete '$obs' for id '$id'\n"}
103    }
104}
105close EDAM;
106
107# dbxref.txt from ftp://ftp.ebi.ac.uk/
108# pub/databases/uniprot/current_release/knowledgebase/complete/docs/dbxref.txt
109
110open(DBXREF, "/homes/pmr/devemboss/emboss/data/dbxref.txt") || die "Cannot open dbxref.txt";
111
112$isdata = 0;
113while(<DBXREF>){
114    if($isdata){
115	if(/^[-]+$/) {$isdata = 0;next}
116	if(/^$/) {next}
117	elsif(/^AC    : (\S+)/) {
118	    $xrac = $1;
119	    $xrac{$xrac} = $xrac;
120	}
121	elsif(/^Abbrev: (.*)$/) {
122	    $xrid = $1;
123	    $xrid{$xrid} = $xrac;
124	    $xrac{$xrac} = $xrid;
125	}
126	elsif(/^LinkTp: (.*)$/) {
127	    $xlink{$xrid} = $1;
128	}
129	elsif(/^Server: (.*)$/) {
130	    $xserver{$xrid} = $1;
131	}
132	elsif(/^Db_URL: (.*)$/) {
133	    $xurl{$xrid} = $1;
134	}
135	elsif(/^Cat   : (.*)$/) {
136	    $xurl{$xrid} = $1;
137	}
138	elsif(/^Name  : (.*)$/) {
139	    $xname{$xrid} = $1;
140	}
141	elsif(/^Ref   : (.*)$/) {
142	    $xcit{$xrid} = $1;
143	}
144	elsif(/^Note  : (.*)$/) {
145	    $nore{$xrid} = $1;
146	}
147	elsif(/^        (\S.*)$/) {
148	    $continue{$xrid} = $1;
149	}
150	else {
151	    print STDERR "DBXREF-BADLINE: $_";
152	}
153    }
154    if(/^[_]+$/) {$isdata = 1}
155}
156
157close DBXREF;
158
159open(DRNEW, ">DRCAT.new") || die "Cannot open DRCAT.new";
160open(DRCAT, "/homes/pmr/devemboss/emboss/data/DRCAT.dat") || die "Cannot open DRCAT.dat";
161
162open(DRNEW, ">DRCAT.new") || die "Cannot open DRCAT.new";
163
164$line = 0;
165$iline = 0;
166while (<DRCAT>) {
167    $line++;
168    $keep = 1;
169    if(/^[\#]/){print DRNEW; next}
170    if(/^\s*$/){print DRNEW; next}
171
172    if(/^(\S+) +([^\n]+)/){
173	$pref = $1;
174	$rest = $2;
175	if($rest eq "None" || $rest eq "Unknown" ) {
176	    if($required{$pref}) {
177		print STDERR "$line:  REQUIRED: $_";
178		$keep = 0;
179	    }
180	    elsif($preferred{$pref}) {
181		print STDERR "$line: PREFERRED: $_";
182		$keep = 0;
183	    }
184	    else {
185		if($rest eq "None" && $pref eq "Taxon") {
186		    $oid{$did}++;
187		}
188		else {
189		    print STDERR "$line:     EMPTY: $_";
190		}
191		$keep = 0;
192	    }
193	}
194	else {
195	    if($rest =~ /\s\s\s+$/) {
196		print STDERR "$line: SPACES: $_";
197	    }
198	    if($pref =~ /^ID$/) {
199		if($iline) {checkid()}
200		$did = $rest;
201		%edamid = ();
202		%edamdat = ();
203		%edamfmt = ();
204		%edamtpc = ();
205		%qdat = ();
206		%qfmt = ();
207		%qid = ();
208		%oid = ();
209		%rid = ();
210		%xid = ();
211		$contact = "unknown-contact";
212		$email = "unknown-email";
213		$iline = $line;
214	    }
215	    elsif($pref =~ /^Taxon/) {
216		$oid{$did}++;
217	    }
218	    elsif($pref =~ /^Acc/) {
219		$dac = $rest;
220		$catid{$did} = $dac;
221		$catac{$dac} = $did;
222	    }
223	    elsif($pref =~ /^Contact/) {
224		$contact = $rest;
225	    }
226	    elsif($pref =~ /^Email/) {
227		$email = $rest;
228	    }
229	    elsif($pref =~ /^Desc/) {
230		if($rest =~ /[\|]/) {
231		    print STDERR "$line: BAD-DESC: $_";
232		}
233	    }
234	    elsif($pref =~ /^Xref/) {
235		($type, $ids) = ($rest =~ /^(\S+) [\|] (.*)/);
236		if(!defined($xrefs{$type})){
237		    print STDERR "$line: XREF-TYPE $type: $_";
238		}
239		elsif($type eq "SP_explicit") {
240		    if($ids =~ /[Nn]one/) {
241		    print STDERR "$line: XREF-NONE ($did)\n";
242		    }
243		}
244		$rid{$type}++;
245	    }
246	    elsif($pref =~ /^Query/) {
247		$rest =~ s/ [\{][^\}]*[\}]//g;
248		($data, $fmt, $ids, $url, $xtra) = split(/ [\| ] /, $rest);
249		if($data eq "Unknown" && $id eq "Unknown" && $fmt eq "Unknown" && $url eq "Unknown") {
250		    $keep = 0;
251		    print STDERR "$line: skip\n";
252		}
253		if(!defined($url) || defined($xtra)){
254		    print STDERR "$line: BAD-QUERY: $_";
255		}
256
257		$qdat{$data}++;
258		if(!keys(%edamdat)) {
259			print STDERR "$line: QRY EDAMdat MISSING '$data': $_";
260		}
261		elsif(!defined($edamdat{$data})) {
262			print STDERR "$line: QRY DATA '$data': $_";
263		}
264
265		$qfmt{$fmt}++;
266		if(!defined($edamfmt{$fmt}) && !defined($knownfmt{$fmt})) {
267			print STDERR "$line: QRY FORMAT '$fmt': $_";
268		}
269
270		@ids = split(/;/, $ids);
271		foreach $qid (@ids) {
272		    if(!defined($edamid{$qid})) {
273			if(defined($edamdat{$qid})) {
274			    print STDERR "$line: QRY ID=DAT '$qid': $_";
275			}
276			elsif ($qid eq "None") {}
277			else {
278			    print STDERR "$line: QRY ID '$qid': $_";
279			    $badqid{$qid}++;
280			}
281		    }
282		    $qid{$qid}++;
283		}
284	    }
285	    elsif($pref =~ /^Example/) {
286		$rest =~ s/ [\{][^\}]*[\}]//g;
287		($ids, $values) = split(/ [\| ] /, $rest);
288		if($data eq "Unknown" && $id eq "Unknown" && $fmt eq "Unknown" && $url eq "Unknown") {
289		    $keep = 0;
290		    print STDERR "$line: skip\n";
291		}
292		if(!defined($values)){
293		    print STDERR "$line: BAD-QUERY: $_";
294		}
295
296		@ids = split(/;/, $ids);
297		foreach $xid (@ids) {
298		    if(!defined($qid{$xid})) {
299			print STDERR "$line: EXAMPLE ID NO QRY '$xid': $_";
300		    }
301		    elsif(!defined($edamid{$xid})) {
302			if(defined($edamdat{$xid})) {
303			    print STDERR "$line: EXAMPLE ID=DAT '$xid': $_";
304			}
305			else {
306			    print STDERR "$line: EXAMPLE ID '$xid': $_";
307			    $badxid{xqid}++;
308			}
309		    }
310		    $xid{$xid}++;
311		}
312		@values = split(/;/, $values);
313		if($#values != $#ids) {
314		    print STDERR "$line: EXAMPLE $#ids ids $#values values: $_";
315		}
316		foreach $xval (@values) {
317		}
318	    }
319	    elsif($pref =~ /^EDAM/) {
320		($term, $name) = ($rest =~ /^(\d+) [\|] (.*)/);
321		if(!defined($term)) {
322		    print STDERR "$line: BAD LINE: $_";
323		}
324		elsif(defined($trueid{$term})) {
325		    $newterm = $trueid{$term};
326		    print STDERR "$line: EDAM-ALTID $trueid{$term} '$edam{$newterm}' $_";
327		}
328		elsif ($pref eq "EDAMtpc") {
329		    $edamtpc{$name} = $term;
330		    if(!defined($edam{$term})) {
331			print STDERR "$line: NOT-IN-EDAM: $_";
332		    }
333		    elsif($edamspace{$term} ne "topic") {
334			print STDERR "$line: EDAM-NAMESPACE $edamspace{$term}: $_";
335		    }
336		}
337		elsif ($pref eq "EDAMfmt") {
338		    $edamfmt{$name} = $term;
339		    if(!defined($edam{$term})) {
340			print STDERR "$line: NOT-IN-EDAM: $_";
341		    }
342		    elsif($edamspace{$term} ne "format") {
343			print STDERR "$line: EDAM-NAMESPACE $edamspace{$term}: $_";
344		    }
345		}
346		elsif ($pref eq "EDAMdat") {
347		    $edamdat{$name} = $term;
348		    if(!defined($edam{$term})) {
349			print STDERR "$line: NOT-IN-EDAM: $_";
350		    }
351		    elsif($edamspace{$term} ne "data") {
352			print STDERR "$line: EDAM-NAMESPACE $edamspace{$term}: $_";
353		    }
354		}
355		elsif ($pref eq "EDAMid") {
356		    $edamid{$name} = $term;
357		    if(!defined($edam{$term})) {
358			print STDERR "$line: NOT-IN-EDAM: $_";
359		    }
360		    elsif($edamspace{$term} ne "identifier") {
361			print STDERR "$line: EDAM-NAMESPACE $edamspace{$term}: $_";
362		    }
363		}
364		elsif ($pref eq "EDAMres") {
365		    print STDERR "$line: EDAM-RESOURCE $edamspace{$term}: $_";
366		}
367		if($isobs{$term}) {
368		    print STDERR "$line: EDAM-OBSOLETE: $_";
369		}
370		if(defined($edam{$term}) && ($edam{$term} ne $name)) {
371		    print STDERR "$line: EDAM-NAME '$edam{$term}': $_";
372		}
373	    }
374	}
375    }
376    if($keep) {print DRNEW}
377}
378
379close DRCAT;
380checkid();
381
382sub numqid()
383{
384    my $ret = ($badqid{$a} <=> $badqid{$b});
385    if(!$ret) {$ret = $a cmp $b};
386    $ret;
387}
388sub numxqid()
389{
390    my $ret = ($noexample{$a} <=> $noexample{$b});
391    if(!$ret) {$ret = $a cmp $b};
392    $ret;
393}
394print STDERR "\nSummary:\n\n";
395print STDERR "\nUndefined query id:\n\n";
396foreach $b (sort numqid (keys(%badqid))) {
397    printf STDERR "%6d %s\n", $badqid{$b}, $b;
398}
399print STDERR "\nNo example for query id:\n\n";
400foreach $qi (sort numxqid (keys(%noexample))) {
401    printf STDERR "%6d %s\n", $noexample{$qi}, $qi;
402}
403
404foreach $x(sort(keys(%catac))) {
405    if(!defined($xrac{$x})) {
406	print STDERR "DB-BADAC: $x $catac{$x}\n";
407    }
408}
409foreach $x(sort(keys(%xrac))) {
410    if(!defined($catac{$x})) {
411	print STDERR "NEWDB-AC: $x $xrac{$x}\n";
412    }
413}
414foreach $x(sort(keys(%xrid))) {
415    if(!defined($catid{$x})) {
416	print STDERR "NEWDB-ID: $xrid{$x} $x\n";
417    }
418}
419