/*
 * url.c
 *
 * doi_to_url()
 * Handle outputting DOI as a URL (Endnote and RIS formats)
 *     1) Prepend https://doi.org as necessary
 *     2) Check for overlap with pre-existing URL for the DOI
 *
 * is_doi()
 * Check for DOI buried in another field.
 *
 * Copyright (c) Chris Putnam 2008-2020
 *
 * Source code released under the GPL version 2
 *
 */
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <ctype.h>
21 #include "bibutils.h"
22 #include "url.h"
23 
24 static void
construct_url(char * prefix,str * id,str * id_url,char sep)25 construct_url( char *prefix, str *id, str *id_url, char sep )
26 {
27 	if ( !strncasecmp( str_cstr( id ), "http:", 5 ) )
28 		str_strcpy( id_url, id );
29 	else {
30 		str_strcpyc( id_url, prefix );
31 		if ( sep!='\0' ) {
32 			if ( id->data[0]!=sep ) str_addchar( id_url, sep );
33 		}
34 		str_strcat( id_url, id );
35 	}
36 }
37 
38 static int
url_exists(fields * f,char * urltag,str * doi_url)39 url_exists( fields *f, char *urltag, str *doi_url )
40 {
41 	int i, n;
42 	if ( urltag ) {
43 		n = fields_num( f );
44 		for ( i=0; i<n; ++i ) {
45 			if ( strcmp( fields_tag( f, i, FIELDS_CHRP ), urltag ) ) continue;
46 			if ( strcmp( fields_value( f, i, FIELDS_CHRP ), str_cstr( doi_url ) ) ) continue;
47 			return 1;
48 		}
49 	}
50 	return 0;
51 }
52 
53 static void
xxx_to_url(fields * f,int n,char * http_prefix,char * urltag,str * xxx_url,char sep)54 xxx_to_url( fields *f, int n, char *http_prefix, char *urltag, str *xxx_url, char sep )
55 {
56 	str_empty( xxx_url );
57 	construct_url( http_prefix, fields_value( f, n, FIELDS_STRP ), xxx_url, sep );
58 	if ( url_exists( f, urltag, xxx_url ) )
59 		str_empty( xxx_url );
60 }
61 void
doi_to_url(fields * f,int n,char * urltag,str * url)62 doi_to_url( fields *f, int n, char *urltag, str *url )
63 {
64 	xxx_to_url( f, n, "https://doi.org", urltag, url, '/' );
65 }
66 void
jstor_to_url(fields * f,int n,char * urltag,str * url)67 jstor_to_url( fields *f, int n, char *urltag, str *url )
68 {
69 	xxx_to_url( f, n, "http://www.jstor.org/stable", urltag, url, '/' );
70 }
71 void
pmid_to_url(fields * f,int n,char * urltag,str * url)72 pmid_to_url( fields *f, int n, char *urltag, str *url )
73 {
74 	xxx_to_url( f, n, "http://www.ncbi.nlm.nih.gov/pubmed", urltag, url, '/' );
75 }
76 void
pmc_to_url(fields * f,int n,char * urltag,str * url)77 pmc_to_url( fields *f, int n, char *urltag, str *url )
78 {
79 	xxx_to_url( f, n, "http://www.ncbi.nlm.nih.gov/pmc/articles", urltag, url, '/' );
80 }
81 void
arxiv_to_url(fields * f,int n,char * urltag,str * url)82 arxiv_to_url( fields *f, int n, char *urltag, str *url )
83 {
84 	xxx_to_url( f, n, "http://arxiv.org/abs", urltag, url, '/' );
85 }
86 void
mrnumber_to_url(fields * f,int n,char * urltag,str * url)87 mrnumber_to_url( fields *f, int n, char *urltag, str *url )
88 {
89 	xxx_to_url( f, n, "http://www.ams.org/mathscinet-getitem?mr=", urltag, url, '\0' );
90 }
91 
/*
 * string_pattern()
 *
 * Check whether the beginning of s matches pattern.  Characters of s
 * beyond the length of pattern are not looked at.
 *
 * Rules for the pattern:
 *   '#' = any single decimal digit
 *   isalpha() = match precisely (matchcase!=0) or match regardless of case
 *   	(matchcase==0)
 *   all others must match precisely
 *
 * Returns 1 on a match and 0 otherwise; a NULL s or pattern, or an s that
 * is shorter than pattern, is never a match.
 */
static int
string_pattern( const char *s, const char *pattern, int matchcase )
{
	unsigned char pc, sc;
	size_t i;

	if ( !s || !pattern ) return 0;

	/* One pass, bounded by the pattern: s is never scanned past the
	 * pattern length.  If s ends early its '\0' fails every rule below,
	 * so we return before reading beyond the terminator. */
	for ( i=0; pattern[i]!='\0'; ++i ) {
		pc = (unsigned char) pattern[i];
		sc = (unsigned char) s[i];
		if ( pc=='#' ) {
			if ( !isdigit( sc ) ) return 0;
		} else if ( !matchcase && isalpha( pc ) ) {
			if ( tolower( pc )!=tolower( sc ) ) return 0;
		} else {
			if ( pc!=sc ) return 0;
		}
	}

	return 1;
}
117 
/*
 * is_doi()
 *
 * Check whether s starts with a DOI ("##.####/..."), either bare or behind
 * one of the decorations listed below.  Letters in the decoration are
 * matched without regard to case.
 *
 *   science direct is now doing "M3  - doi: DOI: 10.xxxx/xxxxx"
 *   elsevier is doing "DO - https://doi.org/xx.xxxx/xxxx..."
 *
 * Returns the number of leading characters of s to skip to reach the bare
 * DOI (0 if there is no decoration), or -1 if no DOI form is recognized.
 */
int
is_doi( char *s )
{
	static const struct {
		char *pattern;  /* form handed to string_pattern() */
		int   skip;     /* length of the decoration in front of the DOI */
	} forms[] = {
		{ "##.####/",                  0 },
		{ "doi:##.####/",              4 },
		{ "doi: ##.####/",             5 },
		{ "doi: DOI: ##.####/",       10 },
		{ "https://doi.org/##.####/", 16 },
	};
	const int nforms = (int) ( sizeof( forms ) / sizeof( forms[0] ) );
	int k;

	/* forms are tried in table order; the first match wins */
	for ( k=0; k<nforms; ++k ) {
		if ( string_pattern( s, forms[k].pattern, 0 ) ) return forms[k].skip;
	}

	return -1;
}
130 
/*
 * is_uri_remote_scheme()
 *
 * Determine if the string starts with one of the Uniform Resource
 * Identifier schemes "http:", "https:", "ftp:", "git:" or "gopher:"
 * (compared without regard to letter case).
 *
 * returns -1, if not true
 * returns offset that skips over the URI scheme (its length including the
 * ':'), if true
 */
int
is_uri_remote_scheme( char *p )
{
	static const char * const schemes[] = {
		"http:", "https:", "ftp:", "git:", "gopher:", NULL
	};
	const char * const *sp;
	size_t len;

	for ( sp=schemes; *sp!=NULL; ++sp ) {
		len = strlen( *sp );
		if ( strncasecmp( p, *sp, len )==0 ) return (int) len;
	}

	return -1;
}
147 
/*
 * is_reference_database()
 *
 * Determine if the string starts with one of the reference database
 * labels "arXiv:", "pubmed:", "medline:" or "isi:" (compared without
 * regard to letter case).
 *
 * returns -1, if not true
 * returns the length of the matching label including the ':', if true
 */
int
is_reference_database( char *p )
{
	static const char * const labels[] = { "arXiv:", "pubmed:", "medline:", "isi:" };
	const size_t nlabels = sizeof( labels ) / sizeof( labels[0] );
	size_t k, len;

	for ( k=0; k<nlabels; ++k ) {
		len = strlen( labels[k] );
		if ( strncasecmp( p, labels[k], len )==0 ) return (int) len;
	}

	return -1;
}
159 
/*
 * is_embedded_link()
 *
 * Many fields have been abused to embed URLs, DOIs, etc.
 *
 * Returns 1 if s is recognized by is_uri_remote_scheme(),
 * is_reference_database() or is_doi() (tried in that order, stopping at
 * the first hit); returns 0 otherwise.
 */
int
is_embedded_link( char *s )
{
	return is_uri_remote_scheme( s )!=-1 ||
	       is_reference_database( s )!=-1 ||
	       is_doi( s )!=-1;
}
169 
/* One recognized link prefix and the field tag that goes with it. */
struct url_t {
	char *tag;     /* field tag for this kind of link, e.g. "DOI" */
	char *prefix;  /* leading text that identifies the kind of link */
	int offset;    /* number of prefix characters compared by find_prefix()
	                * and skipped by urls_split_and_add() */
};
typedef struct url_t url_t;
175 
176 static url_t prefixes[] = {
177 	/*              00000000001111111112222222222333333333344444444445 */
178 	/*              12345678901234567890123456789012345678901234567890 */
179 	{ "ARXIV",     "http://arxiv.org/abs/",                     21 },
180 	{ "DOI",       "https://doi.org/",                          16 },
181 	{ "JSTOR",     "http://www.jstor.org/stable/",              28 },
182 	{ "MRNUMBER",  "http://www.ams.org/mathscinet-getitem?mr=", 41 },
183 	{ "PMID",      "http://www.ncbi.nlm.nih.gov/pubmed/",       35 },
184 	{ "PMC",       "http://www.ncbi.nlm.nih.gov/pmc/articles/", 41 },
185 	{ "ISIREFNUM", "isi:",                                       4 },
186 };
187 static int nprefixes = sizeof( prefixes ) / sizeof( prefixes[0] );
188 
189 /* do not add, but recognize */
190 static url_t extraprefixes[] = {
191 	/*              00000000001111111112222222222333333333344444444445 */
192 	/*              12345678901234567890123456789012345678901234567890 */
193 	{ "ARXIV",     "arXiv:",                                     6 },
194 	{ "DOI",       "http://dx.doi.org/",                        18 },
195 	{ "JSTOR",     "jstor:",                                     6 },
196 	{ "PMID",      "pmid:",                                      5 },
197 	{ "PMID",      "pubmed:",                                    7 },
198 	{ "PMC",       "pmc:",                                       4 },
199 	{ "URL",       "\\urllink",                                  8 },
200 	{ "URL",       "\\url",                                      4 },
201 };
202 static int nextraprefixes = sizeof( extraprefixes ) / sizeof( extraprefixes[0] );
203 
204 static int
find_prefix(const char * s,url_t * p,int np)205 find_prefix( const char *s, url_t *p, int np )
206 {
207 	int i;
208 
209 	if ( s ) {
210 		for ( i=0; i<np; ++i )
211 			if ( !strncmp( p[i].prefix, s, p[i].offset ) ) return i;
212 	}
213 
214 	return -1;
215 }
216 
217 int
urls_split_and_add(char * value_in,fields * out,int lvl_out)218 urls_split_and_add( char *value_in, fields *out, int lvl_out )
219 {
220 	int n, fstatus, status = BIBL_OK;
221 	char *tag = "URL";
222 	int offset = 0;
223 
224 	n = find_prefix( value_in, prefixes, nprefixes );
225 	if ( n!=-1 ) {
226 		tag    = prefixes[n].tag;
227 		offset = prefixes[n].offset;
228 	} else {
229 		n = find_prefix( value_in, extraprefixes, nextraprefixes );
230 		if ( n!=-1 ) {
231 			tag    = extraprefixes[n].tag;
232 			offset = extraprefixes[n].offset;
233 		}
234 	}
235 
236 	fstatus = fields_add( out, tag, &(value_in[offset]), lvl_out );
237 	if ( fstatus!=FIELDS_OK ) status = BIBL_ERR_MEMERR;
238 
239 	return status;
240 }
241 
242 /* urls_add_type()
243  *
244  * Append urls of a specific type with a specific prefix (which can be empty).
245  * We don't allow duplications here.
246  *
247  */
248 static int
urls_merge_and_add_type(fields * out,char * tag_out,int lvl_out,char * prefix,vplist * values)249 urls_merge_and_add_type( fields *out, char *tag_out, int lvl_out, char *prefix, vplist *values )
250 {
251 	int fstatus, status = BIBL_OK;
252 	vplist_index i;
253 	str url;
254 
255 	str_init( &url );
256 
257 	for ( i=0; i<values->n; ++i ) {
258 		str_strcpyc( &url, prefix );
259 		str_strcatc( &url, ( char * ) vplist_get( values, i ) );
260 		fstatus = fields_add( out, tag_out, str_cstr( &url ), lvl_out );
261 		if ( fstatus!=FIELDS_OK ) {
262 			status = BIBL_ERR_MEMERR;
263 			goto out;
264 		}
265 
266 	}
267 out:
268 	str_free( &url );
269 	return status;
270 }
271 
272 /*
273  * urls_merge_and_add()
274  *
275  * Append urls of types controlled by the list type and automatically append appropriate
276  * prefixes. If no prefix is found for the entry, don't add one (e.g. "URL" entries).
277  *
278  * Control of the types to be added by list type is necessary as some reference formats
279  * like bibtex ought to do special things with DOI, ARXIV, MRNUMBER, and the like.
280  */
281 int
urls_merge_and_add(fields * in,int lvl_in,fields * out,char * tag_out,int lvl_out,slist * types)282 urls_merge_and_add( fields *in, int lvl_in, fields *out, char *tag_out, int lvl_out, slist *types )
283 {
284 	int i, j, status = BIBL_OK;
285 	char *tag, *prefix, *empty="";
286 	vplist a;
287 
288 	vplist_init( &a );
289 
290 	for ( i=0; i<types->n; ++i ) {
291 
292 		tag = slist_cstr( types, i );
293 
294 		/* ...look for data of requested type; if not found skip */
295 		vplist_empty( &a );
296 		fields_findv_each( in, lvl_in, FIELDS_CHRP, &a, tag );
297 		if ( a.n==0 ) continue;
298 
299 		/* ...find the prefix (if present) */
300 		prefix = empty;
301 		for ( j=0; j<nprefixes; ++j ) {
302 			if ( !strcmp( prefixes[j].tag, tag ) ) {
303 				prefix = prefixes[j].prefix;
304 				break; /* take the first prefix in the list */
305 			}
306 		}
307 
308 		/* ...append all data of this type */
309 		status = urls_merge_and_add_type( out, tag_out, lvl_out, prefix, &a );
310 		if ( status!=BIBL_OK ) goto out;
311 	}
312 
313 out:
314 	vplist_free( &a );
315 
316 	return status;
317 }
318