1 /*
2   Copyright (c) 2006 - 2021
3   CLST  - Radboud University
4   ILK   - Tilburg University
5 
6   This file is part of libfolia
7 
8   libfolia is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   libfolia is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ticcutils/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 */
26 #include <cassert>
27 #include <cstdlib>
28 #include <sys/types.h>
29 #include <sys/socket.h>
30 #include <unistd.h>
31 #include <netdb.h>
32 #include <iostream>
33 #include <fstream>
34 #include <string>
35 #include <stdexcept>
36 #include "ticcutils/PrettyPrint.h"
37 #include "ticcutils/XMLtools.h"
38 #include "ticcutils/StringOps.h"
39 #include "libfolia/folia.h"
40 #include "libfolia/folia_provenance.h"
41 
42 using namespace std;
43 using namespace icu;
44 
45 namespace folia {
46   using TiCC::operator<<;
47 
print(ostream & os,const int indent) const48   void processor::print( ostream& os, const int indent ) const {
49     /// print a complete processor instance to a stream (Debugging purposes)
50     /*!
51       \param os The output stream
52       \param indent The identation (in spaces) to use
53     */
54     string space = string(indent,'\t');
55     os << space << "name=" << _name << endl;
56     os << space << "id=" << _id << endl;
57     os << space << "version=" << _version << endl;
58     os << space << "type=" << TiCC::toString(_type) << endl;
59     os << space << "folia_version=" << _folia_version << endl;
60     os << space << "document_version=" << _document_version << endl;
61     os << space << "command=" << _command << endl;
62     os << space << "host=" << _host << endl << endl;
63     for( const auto& c : _processors ){
64       c->print( os, indent+1 );
65     }
66   }
67 
operator <<(ostream & os,const processor & p)68   ostream& operator<<( ostream& os, const processor& p ){
69     /// output a processor
70     /*!
71       \param os the output stream
72       \param p the processor
73     */
74     p.print( os, 0 );
75     return os;
76   }
77 
operator <<(ostream & os,const processor * p)78   ostream& operator<<( ostream& os, const processor *p ){
79     /// output a processor
80     /*!
81       \param os the output stream
82       \param p the processor
83     */
84     if ( p ){
85       p->print( os, 0 );
86     }
87     else {
88       os << "NO PROCESSOR";
89     }
90     return os;
91   }
92 
string getfqdn( ){
  /// function to get the (fully qualified) hostname of the machine we are running on
  /*!
    \return a string with the canonical hostname as reported by getaddrinfo().
    When the canonical name cannot be determined, the plain result of
    gethostname() is returned instead. When even that fails, the result is
    an empty string.

    Failures are reported on stderr but never terminate the program: the
    hostname is informational only.
  */
  char hostname[1024];
  if ( gethostname( hostname, sizeof(hostname) - 1 ) != 0 ){
    cerr << "failure in gethostname" << endl;
    return string();
  }
  // gethostname() need not terminate the buffer when the name is truncated
  hostname[sizeof(hostname) - 1] = '\0';

  struct addrinfo hints{}; // value-initialized: all fields zero
  hints.ai_family = AF_UNSPEC; /*either IPV4 or IPV6*/
  hints.ai_socktype = SOCK_STREAM;
  hints.ai_flags = AI_CANONNAME;

  struct addrinfo *info = nullptr;
  // no service name is passed: only the canonical name is wanted, so the
  // lookup should not depend on a service being known to the system
  const int gai_result = getaddrinfo( hostname, nullptr, &hints, &info );
  if ( gai_result != 0 ){
    cerr << "failure in getaddrinfo: " << gai_strerror(gai_result) << endl;
    return hostname;
  }

  string result = hostname;
  // only the first entry is consulted; guard against a missing name
  if ( info != nullptr && info->ai_canonname != nullptr ){
    result = info->ai_canonname;
  }
  freeaddrinfo(info);
  return result;
}
123 
string get_user(){
  /// look up the name of the user running the program
  /*!
    \return the value of the USER environment variable, or an empty string
    when that variable is not set
  */
  const char *user_name = getenv( "USER" );
  return user_name ? string( user_name ) : string();
}
136 
get_system_defaults()137   void processor::get_system_defaults(){
138     /// set the sytem information in this processor
139     /*!
140       will set the hostname, the username, the current time and the FoLiA
141       version
142     */
143     _host = getfqdn();
144     _begindatetime = get_ISO_date();
145     _folia_version = folia::folia_version();
146     _user = get_user();
147   }
148 
149   //#define PROC_DEBUG
150 
generate_id(Provenance * prov,const string & name)151   string processor::generate_id( Provenance *prov,
152 				 const string& name ){
153     /// generate an processor id
154     /*!
155       \param prov the provenance data context
156       \param name use this name as base for the id
157       \return the new id
158 
159       First we lookup \em name in the Provenance \em prov. If it is found
160       we generate a new id as sub-id of the name. When not found, we just create
161       a new id \e 'name.1'
162 
163       Some care is taken to make sure NO existing id is generated, when this
164       would happen we add extra '_' characters to name
165     */
166     string new_id;
167     auto it = prov->_names.find(name);
168     if ( it == prov->_names.end() ){
169 #ifdef PROC_DEBUG
170       cerr << "generate_id, " << name << " not found in " <<prov->_names << endl;
171 #endif
172       if ( !isNCName(name) ){
173 	throw XmlError( "generated_id: '" + name
174 			+ "' is not a valid base for an NCName." );
175       }
176       prov->_names[name].insert(1);
177       new_id = name + ".1";
178     }
179     else {
180 #ifdef PROC_DEBUG
181       cerr << "generate_id, " << name << " found " << endl;
182 #endif
183       int val = *(it->second.rbegin());
184 #ifdef PROC_DEBUG
185       cerr << "generate_id, val=" << val << endl;
186 #endif
187       prov->_names[name].insert(++val);
188 #ifdef PROC_DEBUG
189       cerr << "generate_id, ++val=" << val << endl;
190 #endif
191       new_id = name + "." + TiCC::toString(val);
192     }
193     if ( prov->get_processor_by_id(new_id) != 0 ){
194 #ifdef PROC_DEBUG
195       cerr << "generate_id, id=" << new_id << " exists, loop!" << endl;
196 #endif
197       // oops creating an existing one. Not good
198       return generate_id( prov, name + "_1" );
199     }
200     return new_id;
201   }
202 
calculate_next_id()203   string processor::calculate_next_id(){
204     /// create a successor id for this processor
205     /*!
206       \return the new id
207 
208       When the processor has subprocessors, we create an id which is 1 beyond
209       that of the last subprocessor
210 
211       Otherwise we create an id for the first subprocessor
212     */
213     string new_id;
214     if ( !sub_processors().empty() ){
215       string prev_id = sub_processors().back()->id();
216       vector<string> v = TiCC::split_at( prev_id, "." );
217       int val;
218       if ( TiCC::stringTo( v.back(), val ) ){
219 	v.back() = TiCC::toString(++val);
220       }
221       else {
222 	// not a number, just add .1 then, and pray
223 	v.back() += ".1";
224       }
225       for ( const auto& it :  v ){
226 	new_id += it + ".";
227       }
228       new_id.pop_back();
229     }
230     else {
231       new_id = id() + ".1";
232     }
233     return new_id;
234   }
235 
236 
processor(Provenance * prov,processor * parent,const KWargs & atts_in)237   processor::processor( Provenance *prov,
238 			processor* parent,
239 			const KWargs& atts_in ) {
240     /// initialize a processor
241     /*!
242       \param prov The provenance context
243       \param parent A parent to connect to
244       \param atts_in A KWargs list with values to set for the processor
245     */
246     _type = AUTO;
247     KWargs atts = atts_in;
248     string name = atts.extract("name");
249     if ( name.empty() ){
250       throw XmlError( "processor: missing 'name' attribute" );
251     }
252     else {
253       _name = name;
254     }
255 #ifdef PROC_DEBUG
256     cerr << "new processor(" << atts_in << ")" << endl;
257 #endif
258     string id = atts.extract("id");
259     if ( id.empty() ){
260       id = atts.extract("xml:id");
261     }
262     if ( id.empty() ){
263       string gen = atts.extract("generate_id");
264       if ( gen.empty() ){
265 	throw XmlError( "processor: missing 'xml:id' attribute" );
266       }
267 #ifdef PROC_DEBUG
268       cerr << "new processor generate_id() gen==" << gen << endl;
269 #endif
270       if ( gen == "auto()" ){
271 	id = generate_id( prov, _name );
272 #ifdef PROC_DEBUG
273 	cerr << "new processor generate_id(" << _name << ") ==>" << id << endl;
274 #endif
275       }
276       else if ( gen == "next()" ){
277 	if ( !parent ){
278 	  // fall back to auto()
279 	  id = generate_id( prov, _name );
280 	  //	throw invalid_argument( "processor id=next() impossible. No parent" );
281 	}
282 	else {
283 	  id = parent->calculate_next_id();
284 	}
285 #ifdef PROC_DEBUG
286 	cerr << "new processor calculate_next() ==>" << id << endl;
287 #endif
288       }
289       else {
290 	id = generate_id( prov, gen );
291 #ifdef PROC_DEBUG
292 	cerr << "new processor generate_id(" << gen << ") ==>" << id << endl;
293 #endif
294       }
295     }
296     else if ( id == "next()" ){
297       if ( !parent ){
298 	// fall back to auto()
299 	id = generate_id( prov, _name );
300 	//	throw invalid_argument( "processor id=next() impossible. No parent" );
301       }
302       else {
303 	id = parent->calculate_next_id();
304       }
305 #ifdef PROC_DEBUG
306       cerr << "new processor calculate SPECIAAL() ==>" << id << endl;
307 #endif
308     }
309     processor *check = prov->get_processor_by_id( id );
310     if ( check ){
311       throw DuplicateIDError( "processor '" + id + "' already exists" );
312     }
313     _id = id;
314     for ( const auto& att : atts ){
315       if ( att.first == "begindatetime" ){
316 	if ( att.second == "now()" ){
317 	  _begindatetime = get_ISO_date();
318 	}
319 	else {
320 	  _begindatetime = att.second;
321 	}
322       }
323       else if ( att.first == "enddatetime" ){
324 	if ( att.second == "now()" ){
325 	  _enddatetime = get_ISO_date();
326 	}
327 	else {
328 	  _enddatetime = att.second;
329 	}
330       }
331       else if ( att.first == "version" ){
332 	_version = att.second;
333       }
334       else if ( att.first == "document_version" ){
335 	_document_version = att.second;
336       }
337       else if ( att.first == "command" ){
338 	_command = att.second;
339       }
340       else if ( att.first == "folia_version" ){
341 	_folia_version = att.second;
342       }
343       else if ( att.first == "type" ){
344 	try {
345 	  _type = TiCC::stringTo<AnnotatorType>( att.second );
346 	}
347 	catch (...){
348 	  throw XmlError( "processor: invalid value for 'type' attribute: "
349 			  + att.second );
350 	}
351       }
352       else if ( att.first == "host" ){
353 	_host = att.second;
354       }
355       else if ( att.first == "resourcelink" ){
356 	_resourcelink = att.second;
357       }
358       else if ( att.first == "user" ){
359 	_user = att.second;
360       }
361       else if ( att.first == "src" ){
362 	_src = att.second;
363       }
364       else if ( att.first == "format" ){
365 	_format = att.second;
366       }
367       else if ( att.first == "generator" ){
368 	// we automagicly add a subprocessor.
369 	KWargs g_atts;
370 	g_atts["folia_version"] = folia::folia_version();
371 	g_atts["version"] = library_version();
372 	g_atts["type"] = "GENERATOR";
373 	g_atts["id"] = _id + ".generator";
374 	g_atts["name"] = "libfolia";
375 	processor *sub = new processor( prov, this, g_atts );
376 	this->_processors.push_back( sub );
377       }
378     }
379     prov->add_to_index(this);
380   }
381 
~processor()382   processor::~processor(){
383     /// deconstructor for a processor and its subprocessors
384     for ( const auto& p : _processors ){
385       delete p;
386     }
387   }
388 
set_metadata(const string & id,const string & val)389   bool processor::set_metadata( const string& id,
390 				const string& val ){
391     /// set a metadata property in the processor
392     /*!
393       \param id the name of the property
394       \param val the value to set
395       \return true when set, false when already set
396     */
397     if ( _metadata[id].empty() ){
398       _metadata[id] = val;
399       return true;
400     }
401     else {
402       return false;
403     }
404   }
405 
string processor::get_metadata( const string& id ){
  /// retrieve a metadata property of the processor
  /*!
    \param id the name of the property to return
    \return the stored value, or "" when there is no such property
  */
  const auto found = _metadata.find( id );
  return ( found == _metadata.end() ) ? string() : found->second;
}
418 
~Provenance()419   Provenance::~Provenance(){
420     /// deconstruct this provenance context and it's processors
421     for ( const auto& p : processors ){
422       delete p;
423     }
424   }
425 
get_processor_by_id(const string & id) const426   processor *Provenance::get_processor_by_id( const string& id ) const {
427     ///  return a processor with the given id
428     /*!
429       \param id the processor id we search for
430       \return the found processor or 0 when not found
431     */
432     const auto& p = _index.find( id );
433     if ( p != _index.end() ){
434       return p->second;
435     }
436     else {
437       return 0;
438     }
439   }
440 
get_processors_by_name(const string & name) const441   vector<processor*> Provenance::get_processors_by_name( const string& name ) const {
442     /// give a list of all processors with this name
443     /*!
444       \param name the name to search for
445       \return a list of found processors
446 
447       \note processor id's are UNIQUE, processor names ARN'T
448     */
449     vector<processor*> result;
450     for ( auto p = _name_index.lower_bound( name );
451 	  p !=  _name_index.upper_bound( name );
452 	  ++p ){
453       result.push_back( p->second );
454     }
455     return result;
456   }
457 
get_top_processor() const458   processor *Provenance::get_top_processor() const {
459     /// return the main processor in this Provenance context
460     return _first_proc;
461   }
462 
add_to_index(processor * p)463   void Provenance::add_to_index( processor *p ){
464     /// add a procesor to the index
465     _index[p->id()] = p;
466     _name_index.insert( make_pair(p->name(),p) );
467     if ( _first_proc == 0 ){
468       _first_proc = p;
469     }
470   }
471 
parse_processor(const xmlNode * node,processor * parent)472   void Provenance::parse_processor( const xmlNode *node,
473 				    processor *parent ) {
474     /// parse a processor from XML
475     /*!
476       \param node the xmlNode whre the processor is found
477       \param parent the processor to connect to (may be 0)
478      */
479     KWargs node_atts = getAttributes( node );
480     processor *main = new processor( this, parent, node_atts );
481     if ( parent ){
482       parent->_processors.push_back( main );
483     }
484     else {
485       processors.push_back( main );
486     }
487     //    cerr << "created procesor(" << node_atts << ")" << endl;
488     xmlNode *n = node->children;
489     while ( n ){
490       string tag = TiCC::Name( n );
491       if ( tag == "processor" ){
492      	parse_processor(n,main);
493       }
494       else if ( tag == "meta" ){
495 	KWargs atts = getAttributes( n );
496 	string id = atts["id"];
497 	if ( id.empty() ){
498 	  throw XmlError( "processor: missing 'id' for meta tag" );
499 	}
500 	if ( atts.size() != 1 ){
501 	  throw XmlError( "processor: invalid attribute(s) in meta tag" );
502 	}
503 	string value = TiCC::XmlContent( n );
504 	main->_metadata[id] = value;
505       }
506       n = n->next;
507     }
508   }
509 
operator <<(ostream & os,const Provenance & p)510   ostream& operator<<( ostream& os, const Provenance& p ){
511     /// output the provenance context (debugging only)
512     os << "provenance data" << endl;
513     os << "NAMES: " << p._names << endl;
514     for ( const auto& pr : p.processors ){
515       pr->print( os, 2 );
516       os << endl;
517     }
518     return os;
519   }
520 
operator <<(ostream & os,const Provenance * p)521   ostream& operator<<( ostream& os, const Provenance* p ){
522     /// output the provenance context (debugging only)
523     if ( p ){
524       os << *p;
525     }
526     else {
527       os << "no provenance";
528     }
529     return os;
530   }
531 
532 } // namespace folia
533