1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of libfolia 7 8 libfolia is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 libfolia is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ticcutils/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 */ 26 #include <cassert> 27 #include <cstdlib> 28 #include <sys/types.h> 29 #include <sys/socket.h> 30 #include <unistd.h> 31 #include <netdb.h> 32 #include <iostream> 33 #include <fstream> 34 #include <string> 35 #include <stdexcept> 36 #include "ticcutils/PrettyPrint.h" 37 #include "ticcutils/XMLtools.h" 38 #include "ticcutils/StringOps.h" 39 #include "libfolia/folia.h" 40 #include "libfolia/folia_provenance.h" 41 42 using namespace std; 43 using namespace icu; 44 45 namespace folia { 46 using TiCC::operator<<; 47 print(ostream & os,const int indent) const48 void processor::print( ostream& os, const int indent ) const { 49 /// print a complete processor instance to a stream (Debugging purposes) 50 /*! 51 \param os The output stream 52 \param indent The identation (in spaces) to use 53 */ 54 string space = string(indent,'\t'); 55 os << space << "name=" << _name << endl; 56 os << space << "id=" << _id << endl; 57 os << space << "version=" << _version << endl; 58 os << space << "type=" << TiCC::toString(_type) << endl; 59 os << space << "folia_version=" << _folia_version << endl; 60 os << space << "document_version=" << _document_version << endl; 61 os << space << "command=" << _command << endl; 62 os << space << "host=" << _host << endl << endl; 63 for( const auto& c : _processors ){ 64 c->print( os, indent+1 ); 65 } 66 } 67 operator <<(ostream & os,const processor & p)68 ostream& operator<<( ostream& os, const processor& p ){ 69 /// output a processor 70 /*! 71 \param os the output stream 72 \param p the processor 73 */ 74 p.print( os, 0 ); 75 return os; 76 } 77 operator <<(ostream & os,const processor * p)78 ostream& operator<<( ostream& os, const processor *p ){ 79 /// output a processor 80 /*! 81 \param os the output stream 82 \param p the processor 83 */ 84 if ( p ){ 85 p->print( os, 0 ); 86 } 87 else { 88 os << "NO PROCESSOR"; 89 } 90 return os; 91 } 92 getfqdn()93 string getfqdn( ){ 94 /// function to get the hostname of the machine we are running on 95 /*! 96 \return a string with the hostname 97 */ 98 string result; 99 struct addrinfo hints, *info, *p; 100 int gai_result; 101 102 char hostname[1024]; 103 hostname[1023] = '\0'; 104 gethostname(hostname, 1023); 105 106 memset(&hints, 0, sizeof hints); 107 hints.ai_family = AF_UNSPEC; /*either IPV4 or IPV6*/ 108 hints.ai_socktype = SOCK_STREAM; 109 hints.ai_flags = AI_CANONNAME; 110 111 if ((gai_result = getaddrinfo(hostname, "http", &hints, &info)) != 0) { 112 cerr << "failure in getaddrinfo: " << gai_strerror(gai_result) << endl; 113 exit(1); 114 } 115 116 for ( p = info; p != NULL; p = p->ai_next ) { 117 result = p->ai_canonname; 118 break; 119 } 120 freeaddrinfo(info); 121 return result; 122 } 123 get_user()124 string get_user(){ 125 /// function to get the username of the program 126 /*! 127 \return a string with the username 128 */ 129 string result; 130 const char *env = getenv( "USER" ); 131 if ( env ){ 132 result = env; 133 } 134 return result; 135 } 136 get_system_defaults()137 void processor::get_system_defaults(){ 138 /// set the sytem information in this processor 139 /*! 140 will set the hostname, the username, the current time and the FoLiA 141 version 142 */ 143 _host = getfqdn(); 144 _begindatetime = get_ISO_date(); 145 _folia_version = folia::folia_version(); 146 _user = get_user(); 147 } 148 149 //#define PROC_DEBUG 150 generate_id(Provenance * prov,const string & name)151 string processor::generate_id( Provenance *prov, 152 const string& name ){ 153 /// generate an processor id 154 /*! 155 \param prov the provenance data context 156 \param name use this name as base for the id 157 \return the new id 158 159 First we lookup \em name in the Provenance \em prov. If it is found 160 we generate a new id as sub-id of the name. When not found, we just create 161 a new id \e 'name.1' 162 163 Some care is taken to make sure NO existing id is generated, when this 164 would happen we add extra '_' characters to name 165 */ 166 string new_id; 167 auto it = prov->_names.find(name); 168 if ( it == prov->_names.end() ){ 169 #ifdef PROC_DEBUG 170 cerr << "generate_id, " << name << " not found in " <<prov->_names << endl; 171 #endif 172 if ( !isNCName(name) ){ 173 throw XmlError( "generated_id: '" + name 174 + "' is not a valid base for an NCName." ); 175 } 176 prov->_names[name].insert(1); 177 new_id = name + ".1"; 178 } 179 else { 180 #ifdef PROC_DEBUG 181 cerr << "generate_id, " << name << " found " << endl; 182 #endif 183 int val = *(it->second.rbegin()); 184 #ifdef PROC_DEBUG 185 cerr << "generate_id, val=" << val << endl; 186 #endif 187 prov->_names[name].insert(++val); 188 #ifdef PROC_DEBUG 189 cerr << "generate_id, ++val=" << val << endl; 190 #endif 191 new_id = name + "." + TiCC::toString(val); 192 } 193 if ( prov->get_processor_by_id(new_id) != 0 ){ 194 #ifdef PROC_DEBUG 195 cerr << "generate_id, id=" << new_id << " exists, loop!" << endl; 196 #endif 197 // oops creating an existing one. Not good 198 return generate_id( prov, name + "_1" ); 199 } 200 return new_id; 201 } 202 calculate_next_id()203 string processor::calculate_next_id(){ 204 /// create a successor id for this processor 205 /*! 206 \return the new id 207 208 When the processor has subprocessors, we create an id which is 1 beyond 209 that of the last subprocessor 210 211 Otherwise we create an id for the first subprocessor 212 */ 213 string new_id; 214 if ( !sub_processors().empty() ){ 215 string prev_id = sub_processors().back()->id(); 216 vector<string> v = TiCC::split_at( prev_id, "." ); 217 int val; 218 if ( TiCC::stringTo( v.back(), val ) ){ 219 v.back() = TiCC::toString(++val); 220 } 221 else { 222 // not a number, just add .1 then, and pray 223 v.back() += ".1"; 224 } 225 for ( const auto& it : v ){ 226 new_id += it + "."; 227 } 228 new_id.pop_back(); 229 } 230 else { 231 new_id = id() + ".1"; 232 } 233 return new_id; 234 } 235 236 processor(Provenance * prov,processor * parent,const KWargs & atts_in)237 processor::processor( Provenance *prov, 238 processor* parent, 239 const KWargs& atts_in ) { 240 /// initialize a processor 241 /*! 242 \param prov The provenance context 243 \param parent A parent to connect to 244 \param atts_in A KWargs list with values to set for the processor 245 */ 246 _type = AUTO; 247 KWargs atts = atts_in; 248 string name = atts.extract("name"); 249 if ( name.empty() ){ 250 throw XmlError( "processor: missing 'name' attribute" ); 251 } 252 else { 253 _name = name; 254 } 255 #ifdef PROC_DEBUG 256 cerr << "new processor(" << atts_in << ")" << endl; 257 #endif 258 string id = atts.extract("id"); 259 if ( id.empty() ){ 260 id = atts.extract("xml:id"); 261 } 262 if ( id.empty() ){ 263 string gen = atts.extract("generate_id"); 264 if ( gen.empty() ){ 265 throw XmlError( "processor: missing 'xml:id' attribute" ); 266 } 267 #ifdef PROC_DEBUG 268 cerr << "new processor generate_id() gen==" << gen << endl; 269 #endif 270 if ( gen == "auto()" ){ 271 id = generate_id( prov, _name ); 272 #ifdef PROC_DEBUG 273 cerr << "new processor generate_id(" << _name << ") ==>" << id << endl; 274 #endif 275 } 276 else if ( gen == "next()" ){ 277 if ( !parent ){ 278 // fall back to auto() 279 id = generate_id( prov, _name ); 280 // throw invalid_argument( "processor id=next() impossible. No parent" ); 281 } 282 else { 283 id = parent->calculate_next_id(); 284 } 285 #ifdef PROC_DEBUG 286 cerr << "new processor calculate_next() ==>" << id << endl; 287 #endif 288 } 289 else { 290 id = generate_id( prov, gen ); 291 #ifdef PROC_DEBUG 292 cerr << "new processor generate_id(" << gen << ") ==>" << id << endl; 293 #endif 294 } 295 } 296 else if ( id == "next()" ){ 297 if ( !parent ){ 298 // fall back to auto() 299 id = generate_id( prov, _name ); 300 // throw invalid_argument( "processor id=next() impossible. No parent" ); 301 } 302 else { 303 id = parent->calculate_next_id(); 304 } 305 #ifdef PROC_DEBUG 306 cerr << "new processor calculate SPECIAAL() ==>" << id << endl; 307 #endif 308 } 309 processor *check = prov->get_processor_by_id( id ); 310 if ( check ){ 311 throw DuplicateIDError( "processor '" + id + "' already exists" ); 312 } 313 _id = id; 314 for ( const auto& att : atts ){ 315 if ( att.first == "begindatetime" ){ 316 if ( att.second == "now()" ){ 317 _begindatetime = get_ISO_date(); 318 } 319 else { 320 _begindatetime = att.second; 321 } 322 } 323 else if ( att.first == "enddatetime" ){ 324 if ( att.second == "now()" ){ 325 _enddatetime = get_ISO_date(); 326 } 327 else { 328 _enddatetime = att.second; 329 } 330 } 331 else if ( att.first == "version" ){ 332 _version = att.second; 333 } 334 else if ( att.first == "document_version" ){ 335 _document_version = att.second; 336 } 337 else if ( att.first == "command" ){ 338 _command = att.second; 339 } 340 else if ( att.first == "folia_version" ){ 341 _folia_version = att.second; 342 } 343 else if ( att.first == "type" ){ 344 try { 345 _type = TiCC::stringTo<AnnotatorType>( att.second ); 346 } 347 catch (...){ 348 throw XmlError( "processor: invalid value for 'type' attribute: " 349 + att.second ); 350 } 351 } 352 else if ( att.first == "host" ){ 353 _host = att.second; 354 } 355 else if ( att.first == "resourcelink" ){ 356 _resourcelink = att.second; 357 } 358 else if ( att.first == "user" ){ 359 _user = att.second; 360 } 361 else if ( att.first == "src" ){ 362 _src = att.second; 363 } 364 else if ( att.first == "format" ){ 365 _format = att.second; 366 } 367 else if ( att.first == "generator" ){ 368 // we automagicly add a subprocessor. 369 KWargs g_atts; 370 g_atts["folia_version"] = folia::folia_version(); 371 g_atts["version"] = library_version(); 372 g_atts["type"] = "GENERATOR"; 373 g_atts["id"] = _id + ".generator"; 374 g_atts["name"] = "libfolia"; 375 processor *sub = new processor( prov, this, g_atts ); 376 this->_processors.push_back( sub ); 377 } 378 } 379 prov->add_to_index(this); 380 } 381 ~processor()382 processor::~processor(){ 383 /// deconstructor for a processor and its subprocessors 384 for ( const auto& p : _processors ){ 385 delete p; 386 } 387 } 388 set_metadata(const string & id,const string & val)389 bool processor::set_metadata( const string& id, 390 const string& val ){ 391 /// set a metadata property in the processor 392 /*! 393 \param id the name of the property 394 \param val the value to set 395 \return true when set, false when already set 396 */ 397 if ( _metadata[id].empty() ){ 398 _metadata[id] = val; 399 return true; 400 } 401 else { 402 return false; 403 } 404 } 405 get_metadata(const string & id)406 string processor::get_metadata( const string& id ){ 407 /// get a metadata property from the processor 408 /*! 409 \param id the name of the property to return 410 \return the value when found or "" when not found 411 */ 412 auto it = _metadata.find( id ); 413 if ( it != _metadata.end() ){ 414 return it->second; 415 } 416 return ""; 417 } 418 ~Provenance()419 Provenance::~Provenance(){ 420 /// deconstruct this provenance context and it's processors 421 for ( const auto& p : processors ){ 422 delete p; 423 } 424 } 425 get_processor_by_id(const string & id) const426 processor *Provenance::get_processor_by_id( const string& id ) const { 427 /// return a processor with the given id 428 /*! 429 \param id the processor id we search for 430 \return the found processor or 0 when not found 431 */ 432 const auto& p = _index.find( id ); 433 if ( p != _index.end() ){ 434 return p->second; 435 } 436 else { 437 return 0; 438 } 439 } 440 get_processors_by_name(const string & name) const441 vector<processor*> Provenance::get_processors_by_name( const string& name ) const { 442 /// give a list of all processors with this name 443 /*! 444 \param name the name to search for 445 \return a list of found processors 446 447 \note processor id's are UNIQUE, processor names ARN'T 448 */ 449 vector<processor*> result; 450 for ( auto p = _name_index.lower_bound( name ); 451 p != _name_index.upper_bound( name ); 452 ++p ){ 453 result.push_back( p->second ); 454 } 455 return result; 456 } 457 get_top_processor() const458 processor *Provenance::get_top_processor() const { 459 /// return the main processor in this Provenance context 460 return _first_proc; 461 } 462 add_to_index(processor * p)463 void Provenance::add_to_index( processor *p ){ 464 /// add a procesor to the index 465 _index[p->id()] = p; 466 _name_index.insert( make_pair(p->name(),p) ); 467 if ( _first_proc == 0 ){ 468 _first_proc = p; 469 } 470 } 471 parse_processor(const xmlNode * node,processor * parent)472 void Provenance::parse_processor( const xmlNode *node, 473 processor *parent ) { 474 /// parse a processor from XML 475 /*! 476 \param node the xmlNode whre the processor is found 477 \param parent the processor to connect to (may be 0) 478 */ 479 KWargs node_atts = getAttributes( node ); 480 processor *main = new processor( this, parent, node_atts ); 481 if ( parent ){ 482 parent->_processors.push_back( main ); 483 } 484 else { 485 processors.push_back( main ); 486 } 487 // cerr << "created procesor(" << node_atts << ")" << endl; 488 xmlNode *n = node->children; 489 while ( n ){ 490 string tag = TiCC::Name( n ); 491 if ( tag == "processor" ){ 492 parse_processor(n,main); 493 } 494 else if ( tag == "meta" ){ 495 KWargs atts = getAttributes( n ); 496 string id = atts["id"]; 497 if ( id.empty() ){ 498 throw XmlError( "processor: missing 'id' for meta tag" ); 499 } 500 if ( atts.size() != 1 ){ 501 throw XmlError( "processor: invalid attribute(s) in meta tag" ); 502 } 503 string value = TiCC::XmlContent( n ); 504 main->_metadata[id] = value; 505 } 506 n = n->next; 507 } 508 } 509 operator <<(ostream & os,const Provenance & p)510 ostream& operator<<( ostream& os, const Provenance& p ){ 511 /// output the provenance context (debugging only) 512 os << "provenance data" << endl; 513 os << "NAMES: " << p._names << endl; 514 for ( const auto& pr : p.processors ){ 515 pr->print( os, 2 ); 516 os << endl; 517 } 518 return os; 519 } 520 operator <<(ostream & os,const Provenance * p)521 ostream& operator<<( ostream& os, const Provenance* p ){ 522 /// output the provenance context (debugging only) 523 if ( p ){ 524 os << *p; 525 } 526 else { 527 os << "no provenance"; 528 } 529 return os; 530 } 531 532 } // namespace folia 533