(e),
sentence_done );
}
else {
// Some text outside words, paragraphs or sentences (yet)
// maybe <div> or <note> or such
// there may be embedded Paragraph, Word and Sentence nodes
// if so, Paragraphs and Sentences should be handled separately
vector<folia::Sentence*> sv = e->select<folia::Sentence>(false);
vector<folia::Paragraph*> pv = e->select<folia::Paragraph>(false);
if ( pv.empty() && sv.empty() ){
// just words or text
string text = e->str( text_policy );
if ( tokDebug > 1 ){
LOG << "tok-" << e->xmltag() << ":" << text << endl;
}
tokenizeLine( text );
vector<vector<Token>> sents;
vector<Token> toks = popSentence();
while ( toks.size() > 0 ){
sents.push_back( toks );
toks = popSentence();
}
if ( sents.size() == 0 ){
// can happen in very rare cases (strange spaces in the input)
// SKIP!
}
else if ( sents.size() > 1 ){
// multiple sentences. We need an extra Paragraph.
// But first check if this is allowed!
folia::FoliaElement *rt;
if ( e->acceptable(folia::Paragraph_t) ){
folia::KWargs args;
string e_id = e->id();
if ( !e_id.empty() ){
args["generate_id"] = e_id;
}
folia::processor *proc = add_provenance_structure( e->doc(),
folia::AnnotationType::PARAGRAPH );
if ( proc ){
args["processor"] = proc->id();
}
args["set"] = e->doc()->default_set( folia::AnnotationType::PARAGRAPH );
folia::Paragraph *p = new folia::Paragraph( args, e->doc() );
e->append( p );
rt = p;
}
else {
rt = e;
}
for ( const auto& sent : sents ){
folia::KWargs args;
string p_id = rt->id();
if ( !p_id.empty() ){
args["generate_id"] = p_id;
}
folia::processor *proc = add_provenance_structure( e->doc(),
folia::AnnotationType::SENTENCE );
if ( proc ){
args["processor"] = proc->id();
}
args["set"] = e->doc()->default_set( folia::AnnotationType::SENTENCE );
folia::Sentence *s = new folia::Sentence( args, e->doc() );
append_to_sentence( s, sent );
++sentence_done;
if (tokDebug > 0){
LOG << "created a new sentence: " << s << endl;
}
rt->append( s );
}
}
else {
// 1 sentence, connect directly.
folia::KWargs args;
string e_id = e->id();
if ( e_id.empty() ){
e_id = e->generateId( e->xmltag() );
args["xml:id"] = e_id + ".s.1";
}
else {
args["generate_id"] = e_id;
}
folia::processor *proc = add_provenance_structure( e->doc(),
folia::AnnotationType::SENTENCE );
if ( proc ){
args["processor"] = proc->id();
}
args["set"] = e->doc()->default_set( folia::AnnotationType::SENTENCE );
folia::Sentence *s = new folia::Sentence( args, e->doc() );
append_to_sentence( s, sents[0] );
++sentence_done;
if (tokDebug > 0){
LOG << "created a new sentence: " << s << endl;
}
e->append( s );
}
}
else if ( !pv.empty() ){
if ( tokDebug > 1 ){
LOG << "found some Paragraphs " << pv << endl;
}
// For now we only handle the Paragraphs, ignore sentences and words
// IS this even valid???
for ( const auto& p : pv ){
handle_one_paragraph( p, sentence_done );
}
}
else {
if ( tokDebug > 1 ){
LOG << "found some Sentences " << sv << endl;
}
// For now we just IGNORE the loose words (backward compatibility)
for ( const auto& s : sv ){
handle_one_sentence( s, sentence_done );
}
}
}
if ( text_redundancy == "full" ){
appendText( e, outputclass );
}
else if ( text_redundancy == "none" ){
removeText( e, outputclass );
}
}
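/*!
\param infile_name the name of a FoLiA XML file to tokenize
\return a pointer to the tokenized folia::Document, or 0 when the
document contains no text in the desired inputclass.
The caller takes ownership of the returned Document.
*/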
folia::Document *TokenizerClass::tokenize_folia( const string& infile_name ){
if ( inputclass == outputclass
&& !doWordCorrection ){
LOG << "ucto: --filter=NO is automatically set. inputclass equals outputclass!"
<< endl;
setFiltering(false);
}
text_policy.set_class( inputclass );
if ( !ignore_tag_hints ){
text_policy.add_handler("token", &handle_token_tag );
}
folia::TextEngine proc( infile_name );
if ( passthru ){
add_provenance_passthru( proc.doc() );
}
else {
add_provenance_setting( proc.doc() );
}
if ( tokDebug > 8){
proc.set_dbg_stream( theErrLog );
proc.set_debug( true );
}
// proc.set_debug( true );
proc.setup( inputclass, true );
int sentence_done = 0;
folia::FoliaElement *p = 0;
folia::FoliaElement *parent = 0;
while ( (p = proc.next_text_parent() ) ){
// LOG << "next text parent: " << p << endl;
if ( !parent ){
parent = p->parent();
// LOG << "my parent: " << parent << endl;
}
if ( already_tokenized ){
++sentence_done;
}
else {
handle_one_text_parent( p, sentence_done );
}
if ( tokDebug > 0 ){
LOG << "done with sentence " << sentence_done << endl;
}
if ( proc.next() ){
if ( tokDebug > 1 ){
LOG << "looping for more ..." << endl;
}
}
}
if ( text_redundancy == "full" ){
appendText( parent, outputclass );
}
else if ( text_redundancy == "none" ){
removeText( parent, outputclass );
}
if ( sentence_done == 0 ){
LOG << "document contains no text in the desired inputclass: "
<< inputclass << endl;
LOG << "NO result!" << endl;
return 0;
}
return proc.doc(true); // take the doc over from the Engine
}
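/*!
\param infile_name the name of a FoLiA XML file to tokenize
\param outfile_name the name of the file to save the result to
Tokenizes the input file and, on success, saves the resulting
FoLiA document to outfile_name.
*/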
void TokenizerClass::tokenize_folia( const string& infile_name,
const string& outfile_name ){
if ( tokDebug > 0 ){
LOG << "[tokenize_folia] (" << infile_name << ","
<< outfile_name << ")" << endl;
}
folia::Document *doc = tokenize_folia( infile_name );
if ( doc ){
doc->save( outfile_name, false );
if ( tokDebug > 0 ){
LOG << "resulting FoLiA doc saved in " << outfile_name << endl;
}
}
else {
if ( tokDebug > 0 ){
LOG << "NO FoLiA doc created! " << endl;
}
}
}
UnicodeString TokenizerClass::outputTokens( const vector<Token>& tokens,
const bool continued ) const {
/*!
\param tokens A list of Tokens to display
\param continued Set to true when outputTokens is invoked multiple
times and it is not the first invocation.
This makes paragraph boundaries work over multiple calls.
\return A UnicodeString representing tokenized lines, including token
information, when verbose mode is on.
*/
short quotelevel = 0;
UnicodeString result;
for ( const auto& token : tokens ) {
UnicodeString outline;
if (tokDebug >= 5){
LOG << "outputTokens: token=" << token << endl;
}
if ( detectPar
&& (token.role & NEWPARAGRAPH)
&& !verbose
&& continued ) {
//output paragraph separator
if ( sentenceperlineoutput ) {
outline += "\n";
}
else {
outline += "\n\n";
}
}
UnicodeString s = token.us;
if (lowercase) {
s = s.toLower();
}
else if ( uppercase ) {
s = s.toUpper();
}
outline += s;
if ( token.role & NEWPARAGRAPH ) {
quotelevel = 0;
}
if ( token.role & BEGINQUOTE ) {
++quotelevel;
}
if ( verbose ) {
outline += "\t" + token.type + "\t" + toUString(token.role) + "\n";
}
if ( token.role & ENDQUOTE ) {
--quotelevel;
}
if ( token.role & ENDOFSENTENCE ) {
if ( verbose ) {
if ( !(token.role & NOSPACE ) ){
outline += "\n";
}
}
else {
if ( quotelevel == 0 ) {
if ( sentenceperlineoutput ) {
outline += "\n";
}
else {
outline += " " + eosmark + " ";
}
if ( splitOnly ){
outline += "\n";
}
}
else { //inside quotation
if ( splitOnly
&& !(token.role & NOSPACE ) ){
outline += " ";
}
}
}
}
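// decide whether a space is needed after this token
// (never after the last one, and not in verbose mode)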
if ( ( &token != &(*tokens.rbegin()) )
&& !verbose ) {
if ( !( (token.role & ENDOFSENTENCE)
&& sentenceperlineoutput
&& !splitOnly ) ){
if ( !(token.role & ENDOFSENTENCE) ){
if ( splitOnly
&& (token.role & NOSPACE) ){
}
else {
outline += " ";
}
}
}
else if ( (quotelevel > 0)
&& sentenceperlineoutput ) {
//FBK: ADD SPACE WITHIN QUOTE CONTEXT IN ANY CASE
outline += " ";
}
}
if (tokDebug >= 5){
LOG << "outputTokens: outline=" << outline << endl;
}
result += outline;
}
return result;
}
int TokenizerClass::countSentences( bool forceentirebuffer ) {
//Return the number of *completed* sentences in the token buffer
//Performs extra sanity checks at the same time, making sure that
//BEGINOFSENTENCE and ENDOFSENTENCE always pair up, and that
//TEMPENDOFSENTENCE roles are converted to proper ENDOFSENTENCE markers
short quotelevel = 0;
int count = 0;
const int size = tokens.size();
int begin = 0;
int i = 0;
for ( auto& token : tokens ) {
if (tokDebug >= 5){
LOG << "[countSentences] buffer#" <= 5){
LOG << "[countSentences] SENTENCE #" << count << " *FORCIBLY* ended" << endl;
}
}
++i;
}
if (tokDebug >= 5){
LOG << "[countSentences] end of loop: returns " << count << endl;
}
return count;
}
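/*!
\return the Tokens of the first completed sentence in the buffer.
The returned tokens are removed from the buffer. An empty result
means that no complete sentence is available (yet).
*/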
vector<Token> TokenizerClass::popSentence() {
vector<Token> outToks;
const int size = tokens.size();
if ( size != 0 ){
short quotelevel = 0;
size_t begin = 0;
for ( int i = 0; i < size; ++i ) {
if (tokens[i].role & NEWPARAGRAPH) {
quotelevel = 0;
}
else if (tokens[i].role & ENDQUOTE) {
--quotelevel;
}
if ( (tokens[i].role & BEGINOFSENTENCE)
&& (quotelevel == 0)) {
begin = i;
}
//FBK: QUOTELEVEL GOES UP BEFORE begin IS UPDATED... RESULTS IN DUPLICATE OUTPUT
if (tokens[i].role & BEGINQUOTE) {
++quotelevel;
}
if ((tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0)) {
size_t end = i;
if (tokDebug >= 1){
LOG << "[tokenize] extracted sentence, begin=" << begin
<< ",end="<< end << endl;
}
for ( size_t index=begin; index <= end; ++index ){
outToks.push_back( tokens[index] );
}
tokens.erase( tokens.begin(), tokens.begin()+end+1 );
if ( !passthru ){
string lang = get_language( outToks );
if ( !settings[lang]->quotes.emptyStack() ) {
settings[lang]->quotes.flushStack( end+1 );
}
}
// we are done...
return outToks;
}
}
}
return outToks;
}
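/*!
\param v a list of Tokens
\return the tokenized text of \e v as a UnicodeString
(verbose mode is temporarily disabled)
*/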
UnicodeString TokenizerClass::getString( const vector<Token>& v ){
if ( !v.empty() ){
//This only makes sense in non-verbose mode, force verbose=false
const bool tv = verbose;
verbose = false;
UnicodeString res = outputTokens( v );
verbose = tv;
return res;
}
return "";
}
string TokenizerClass::getUTF8String( const vector<Token>& v ){
UnicodeString result = getString( v );
return TiCC::UnicodeToUTF8( result );
}
vector<UnicodeString> TokenizerClass::getSentences() {
vector<UnicodeString> sentences;
if (tokDebug > 0) {
LOG << "[getSentences()] before countSent " << endl;
}
int numS = countSentences(true); // force buffer to end with END_OF_SENTENCE
if (tokDebug > 0) {
LOG << "[getSentences] found " << numS << " sentence(s)" << endl;
}
for (int i = 0; i < numS; i++) {
vector<Token> v = popSentence();
UnicodeString tmp = getString( v );
sentences.push_back( tmp );
}
return sentences;
}
vector<string> TokenizerClass::getUTF8Sentences() {
vector<UnicodeString> uv = getSentences();
vector<string> result;
for ( const auto& us : uv ){
result.push_back( TiCC::UnicodeToUTF8(us) );
}
return result;
}
// FBK: return true if character is a quote.
bool TokenizerClass::u_isquote( UChar32 c, const Quoting& quotes ) const {
bool quote = false;
if ( u_hasBinaryProperty( c, UCHAR_QUOTATION_MARK )
|| c == '`'
|| c == U'´' ) {
// M$ users often use the spacing grave and acute accents as a
// quote (apostrophe), but these DON'T have the UCHAR_QUOTATION_MARK
// property, so treat them as quotes anyway
quote = true;
}
else {
UnicodeString opening = quotes.lookupOpen( c );
if (!opening.isEmpty()) {
quote = true;
}
else {
UnicodeString closing = quotes.lookupClose( c );
if (!closing.isEmpty()) {
quote = true;
}
}
}
return quote;
}
//FBK: USED TO CHECK IF CHARACTER AFTER QUOTE IS A BOS.
//MOSTLY THE SAME AS ABOVE, EXCEPT WITHOUT CHECK FOR PUNCTUATION
//BECAUSE: '"Hoera!", zei de man' MUST NOT BE SPLIT ON ','..
bool is_BOS( UChar32 c ){
bool is_bos = false;
UBlockCode s = ublock_getCode(c);
//test for languages that distinguish case
if ( (s == UBLOCK_BASIC_LATIN) || (s == UBLOCK_GREEK)
|| (s == UBLOCK_CYRILLIC) || (s == UBLOCK_GEORGIAN)
|| (s == UBLOCK_ARMENIAN) || (s == UBLOCK_DESERET)) {
if ( u_isupper(c) || u_istitle(c) ) {
//next 'word' starts with an uppercase or titlecase letter
is_bos = true;
}
}
return is_bos;
}
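/*!
\param endindex position of the (potential) closing quote in the
token buffer
\param open the opening quote to match against the quote stack
\param quotes the quote stack for the current language
\return true when a matching opening quote was found, so the pair
could be marked with BEGINQUOTE/ENDQUOTE roles
*/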
bool TokenizerClass::resolveQuote( int endindex,
const UnicodeString& open,
Quoting& quotes ) {
//resolve a quote
int stackindex = -1;
int beginindex = quotes.lookup( open, stackindex );
if (beginindex >= 0) {
if (tokDebug >= 2) {
LOG << "[resolveQuote] Quote found, begin="<< beginindex << ", end="<< endindex << endl;
}
if (beginindex > endindex) {
throw uRangeError( "Begin index for quote is higher than end index!" );
}
//We have a quote!
//resolve sentences within quote, all sentences must be full sentences:
int beginsentence = beginindex + 1;
int expectingend = 0;
int subquote = 0;
int size = tokens.size();
for (int i = beginsentence; i < endindex; i++) {
if (tokens[i].role & BEGINQUOTE) subquote++;
if (subquote == 0) {
if (tokens[i].role & BEGINOFSENTENCE) expectingend++;
if (tokens[i].role & ENDOFSENTENCE) expectingend--;
if (tokens[i].role & TEMPENDOFSENTENCE) {
tokens[i].role &= ~TEMPENDOFSENTENCE;
tokens[i].role |= ENDOFSENTENCE;
tokens[beginsentence].role |= BEGINOFSENTENCE;
beginsentence = i + 1;
}
// In case of nested quoted sentences, such as:
// MvD: "Nou, Van het Gouden Been ofzo herinner ik mij als kind: 'Waar is mijn gouden been?'"
// the BEGINOFSENTENCE is only set for the inner quoted sentence 'Waar is mijn gouden been?'.
// However, we also need one for the outer sentence.
}
else if ( (tokens[i].role & ENDQUOTE)
&& (tokens[i].role & ENDOFSENTENCE)) {
tokens[beginsentence].role |= BEGINOFSENTENCE;
beginsentence = i + 1;
}
if (tokens[i].role & ENDQUOTE) subquote--;
}
if ((expectingend == 0) && (subquote == 0)) {
//ok, all good, mark the quote:
tokens[beginindex].role |= BEGINQUOTE;
tokens[endindex].role |= ENDQUOTE;
if ( tokDebug >= 2 ) {
LOG << "marked BEGIN: " << tokens[beginindex] << endl;
LOG << "marked END: " << tokens[endindex] << endl;
}
}
else if ( expectingend == 1
&& subquote == 0
&& !( tokens[endindex - 1].role & ENDOFSENTENCE) ) {
//missing one endofsentence; we can correct this: the last token inside the quote becomes an endofsentence:
if ( tokDebug >= 2 ) {
LOG << "[resolveQuote] Missing endofsentence in quote, fixing... " << expectingend << endl;
}
tokens[endindex - 1].role |= ENDOFSENTENCE;
//mark the quote
tokens[beginindex].role |= BEGINQUOTE;
tokens[endindex].role |= ENDQUOTE;
}
else {
if ( tokDebug >= 2) {
LOG << "[resolveQuote] Quote can not be resolved, unbalanced sentences or subquotes within quote, skipping... (expectingend=" << expectingend << ",subquote=" << subquote << ")" << endl;
}
//something is wrong. Sentences within quote are not balanced, so we won't mark the quote.
}
//remove from stack (ok, granted, stack is a bit of a misnomer here)
quotes.eraseAtPos( stackindex );
//FBK: ENDQUOTES NEED TO BE MARKED AS ENDOFSENTENCE IF THE PREVIOUS TOKEN
//WAS AN ENDOFSENTENCE. OTHERWISE THE SENTENCES WILL NOT BE SPLIT.
if ( tokens[endindex].role & ENDQUOTE
&& tokens[endindex-1].role & ENDOFSENTENCE ) {
//FBK: CHECK FOR EOS AFTER QUOTES
if ((endindex+1 == size) || //FBK: endindex EQUALS TOKEN SIZE, MUST BE EOSMARKERS
((endindex + 1 < size) && (is_BOS(tokens[endindex+1].us[0])))) {
tokens[endindex].role |= ENDOFSENTENCE;
// FBK: CHECK IF NEXT TOKEN IS A QUOTE AND NEXT TO THE QUOTE A BOS
}
else if ( endindex + 2 < size
&& u_isquote( tokens[endindex+1].us[0], quotes )
&& is_BOS( tokens[endindex+2].us[0] ) ) {
tokens[endindex].role |= ENDOFSENTENCE;
// If the current token is an ENDQUOTE and the next token is a quote and also the last token,
// the current token is an EOS.
}
else if ( endindex + 2 == size
&& u_isquote( tokens[endindex+1].us[0], quotes ) ) {
tokens[endindex].role |= ENDOFSENTENCE;
}
}
return true;
}
else {
return false;
}
}
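/*!
\param i the position of the token to examine in the buffer
\param eosmarkers the end-of-sentence markers of the current language
\param quotes the quote settings of the current language
\return true when the token at \e i is judged to end a sentence
*/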
bool TokenizerClass::detectEos( size_t i,
const UnicodeString& eosmarkers,
const Quoting& quotes ) const {
bool is_eos = false;
UChar32 c = tokens[i].us.char32At(0);
if ( c == '.' || eosmarkers.indexOf( c ) >= 0 ){
if (i + 1 == tokens.size() ) { //No next character?
is_eos = true; //Newline after eosmarker
}
else {
c = tokens[i+1].us.char32At(0);
if ( u_isquote( c, quotes ) ){
// next word is quote
if ( detectQuotes ){
is_eos = true;
}
else if ( i + 2 < tokens.size() ) {
c = tokens[i+2].us.char32At(0);
if ( u_isupper(c) || u_istitle(c) || u_ispunct(c) ){
//next 'word' after quote starts with uppercase or is punct
is_eos = true;
}
}
}
else if ( tokens[i].us.length() > 1 ){
// PUNCTUATION multi...
if ( u_isupper(c) || u_istitle(c) )
is_eos = true;
}
else
is_eos = true;
}
}
return is_eos;
}
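/*!
\param i the position of the token to examine in the buffer
\param quotes the quote stack for the current language
Checks whether the token at \e i opens or closes a quote, pushing
opening quotes on the stack and resolving closing ones.
*/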
void TokenizerClass::detectQuoteBounds( const int i,
Quoting& quotes ) {
UChar32 c = tokens[i].us.char32At(0);
//Detect Quotation marks
if ((c == '"') || ( UnicodeString(c) == """) ) {
if (tokDebug > 1 ){
LOG << "[detectQuoteBounds] Standard double-quote (ambiguous) found @i="<< i << endl;
}
if (!resolveQuote(i,c,quotes)) {
if (tokDebug > 1 ) {
LOG << "[detectQuoteBounds] Doesn't resolve, so assuming beginquote, pushing to stack for resolution later" << endl;
}
quotes.push( i, c );
}
}
else if ( c == '\'' ) {
if (tokDebug > 1 ){
LOG << "[detectQuoteBounds] Standard single-quote (ambiguous) found @i="<< i << endl;
}
if (!resolveQuote(i,c,quotes)) {
if (tokDebug > 1 ) {
LOG << "[detectQuoteBounds] Doesn't resolve, so assuming beginquote, pushing to stack for resolution later" << endl;
}
quotes.push( i, c );
}
}
else {
UnicodeString close = quotes.lookupOpen( c );
if ( !close.isEmpty() ){ // we have an opening quote
if ( tokDebug > 1 ) {
LOG << "[detectQuoteBounds] Opening quote found @i="<< i << ", pushing to stack for resolution later..." << endl;
}
quotes.push( i, c ); // remember it
}
else {
UnicodeString open = quotes.lookupClose( c );
if ( !open.isEmpty() ) { // we have a closing quote
if (tokDebug > 1 ) {
LOG << "[detectQuoteBounds] Closing quote found @i="<< i << ", attempting to resolve..." << endl;
}
if ( !resolveQuote( i, open, quotes )) {
// resolve the matching opening
if (tokDebug > 1 ) {
LOG << "[detectQuoteBounds] Unable to resolve" << endl;
}
}
}
}
}
}
bool isClosing( const Token& tok ){
if ( tok.us.length() == 1 &&
( tok.us[0] == ')' || tok.us[0] == '}'
|| tok.us[0] == ']' || tok.us[0] == '>' ) )
return true;
return false;
}
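/*!
\param offset the position in the token buffer to start from
\param lang the language whose EOS markers and quotes are used
Assigns ENDOFSENTENCE/BEGINOFSENTENCE roles to the tokens in the
buffer, and finally strips spurious sentence roles from trailing
punctuation.
*/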
void TokenizerClass::detectSentenceBounds( const int offset,
const string& lang ){
//find sentences
string method;
if ( detectQuotes ){
method = "[detectSentenceBounds-(quoted)]";
}
else {
method = "[detectSentenceBounds]";
}
const int size = tokens.size();
for (int i = offset; i < size; i++) {
if (tokDebug > 1 ){
LOG << method << " i="<< i << " word=[" << tokens[i].us
<< "] type=" << tokens[i].type
<< ", role=" << tokens[i].role << endl;
}
if ( tokens[i].type.startsWith("PUNCTUATION") ){
if ( tokDebug > 1 ){
LOG << method << " PUNCTUATION FOUND @i=" << i << endl;
}
// we have some kind of punctuation. Does it mark an eos?
bool is_eos = detectEos( i,
settings[lang]->eosmarkers,
settings[lang]->quotes );
if (is_eos) {
// end of sentence found, so wrap up
if ( detectQuotes
&& !settings[lang]->quotes.emptyStack() ) {
// we have some quotes!
if ( tokDebug > 1 ){
LOG << method << " Unbalances quotes: Preliminary EOS FOUND @i="
<< i << endl;
}
// we set a temporary EOS marker,
// to be resolved later when full quote is found.
tokens[i].role |= TEMPENDOFSENTENCE;
// If previous token is also TEMPENDOFSENTENCE,
// it stops being so in favour of this one
if ( i > 0 ){
tokens[i-1].role &= ~TEMPENDOFSENTENCE;
}
}
else {
// No quotes
if ( tokDebug > 1 ){
LOG << method << " EOS FOUND @i=" << i << endl;
}
tokens[i].role |= ENDOFSENTENCE;
// if this is the end of the sentence,
// the next token is the beginning of a new one
if ( (i + 1) < size ){
tokens[i+1].role |= BEGINOFSENTENCE;
}
// if previous token is EOS and not BOS, it will stop being EOS,
// as this one will take its place
if ( i > 0
&& ( tokens[i-1].role & ENDOFSENTENCE )
&& !( tokens[i-1].role & BEGINOFSENTENCE ) ) {
tokens[i-1].role &= ~ENDOFSENTENCE;
tokens[i].role &= ~BEGINOFSENTENCE;
}
}
}
else if ( isClosing(tokens[i] ) ) {
// we have a closing symbol
if ( tokDebug > 1 ){
LOG << method << " Close FOUND @i=" << i << endl;
}
//if previous token is EOS and not BOS, it will stop being EOS, as this one will take its place
if ( i > 0
&& ( tokens[i-1].role & ENDOFSENTENCE )
&& !( tokens[i-1].role & BEGINOFSENTENCE) ) {
tokens[i-1].role &= ~ENDOFSENTENCE;
tokens[i].role &= ~BEGINOFSENTENCE;
}
}
if ( detectQuotes ){
// check the quotes
detectQuoteBounds( i, settings[lang]->quotes );
}
}
}
for (int i = size-1; i > offset; --i ) {
// at the end of the buffer there may be some PUNCTUATION which
// has spurious ENDOFSENTENCE and BEGINOFSENTENCE annotation
// fix this up to avoid sentences containing only punctuation
// also we don't want a BEGINQUOTE to be an ENDOFSENTENCE
if ( tokDebug > 2 ){
LOG << method << " fixup-end i="<< i << " word=["
<< tokens[i].us
<< "] type=" << tokens[i].type
<< ", role=" << tokens[i].role << endl;
}
if ( tokens[i].type.startsWith("PUNCTUATION") ) {
tokens[i].role &= ~BEGINOFSENTENCE;
if ( !detectQuotes ||
(tokens[i].role & BEGINQUOTE) ){
if ( i != size-1 ){
tokens[i].role &= ~ENDOFSENTENCE;
}
}
}
else
break;
}
}
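/*!
\param input a line of text to pass through without real tokenization
\param bos when true, the next token is marked as BEGINOFSENTENCE;
updated for use in subsequent calls
Splits the input on whitespace only, assigning a rough type (word,
number, punctuation) per token, without applying any rules.
*/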
void TokenizerClass::passthruLine( const UnicodeString& input, bool& bos ) {
if (tokDebug) {
LOG << "[passthruLine] input: line=[" << input << "]" << endl;
}
bool alpha = false, num = false, punct = false;
UnicodeString word;
StringCharacterIterator sit(input);
while ( sit.hasNext() ){
UChar32 c = sit.current32();
if ( c == u'\u200D' ){
// a joiner. just ignore
sit.next32();
continue;
}
if ( u_isspace(c) ) {
if ( word.isEmpty() ){
// a leading space. Don't waste time on it. SKIP
sit.next32();
continue;
}
// so a trailing space. handle the found word.
if (tokDebug){
LOG << "[passthruLine] word=[" << word << "]" << endl;
}
if ( word == eosmark ) {
word = "";
if (!tokens.empty())
tokens.back().role |= ENDOFSENTENCE;
bos = true;
}
else {
UnicodeString type;
if (alpha && !num && !punct) {
type = type_word;
}
else if (num && !alpha && !punct) {
type = type_number;
}
else if (punct && !alpha && !num) {
type = type_punctuation;
}
else {
type = type_unknown;
}
if ( doPunctFilter
&& ( type == type_punctuation || type == type_currency ||
type == type_emoticon || type == type_picto ) ) {
if (tokDebug >= 2 ){
LOG << " [passThruLine] skipped PUNCTUATION ["
<< input << "]" << endl;
}
if ( !tokens.empty() ){
tokens.back().role &= ~NOSPACE;
}
}
else {
if ( norm_set.find( type ) != norm_set.end() ){
word = "{{" + type + "}}";
}
if (bos) {
tokens.push_back( Token( type, word , BEGINOFSENTENCE ) );
bos = false;
}
else {
tokens.push_back( Token( type, word ) );
}
}
alpha = false;
num = false;
punct = false;
word = "";
}
}
else {
if ( u_isalpha(c)) {
alpha = true;
}
else if (u_ispunct(c)) {
punct = true;
}
else if (u_isdigit(c)) {
num = true;
}
word += c;
}
sit.next32();
}
if (word != "") {
if ( word == eosmark ) {
word = "";
if (!tokens.empty())
tokens.back().role |= ENDOFSENTENCE;
}
else {
UnicodeString type;
if (alpha && !num && !punct) {
type = type_word;
}
else if (num && !alpha && !punct) {
type = type_number;
}
else if (punct && !alpha && !num) {
type = type_punctuation;
}
else {
type = type_unknown;
}
if ( doPunctFilter
&& ( type == type_punctuation || type == type_currency ||
type == type_emoticon || type == type_picto ) ) {
if (tokDebug >= 2 ){
LOG << " [passThruLine] skipped PUNCTUATION ["
<< input << "]" << endl;
}
if ( !tokens.empty() ){
tokens.back().role &= ~NOSPACE;
}
}
else {
if ( norm_set.find( type ) != norm_set.end() ){
word = "{{" + type + "}}";
}
if (bos) {
tokens.push_back( Token( type, word , BEGINOFSENTENCE ) );
bos = false;
}
else {
tokens.push_back( Token( type, word ) );
}
}
}
}
if ( sentenceperlineinput && tokens.size() > 0 ) {
tokens[0].role |= BEGINOFSENTENCE;
tokens.back().role |= ENDOFSENTENCE;
}
}
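/*!
\param in the input stream to examine (cin is never examined)
\return the encoding detected from a leading BOM, or the configured
inputEncoding when there is none.
The stream is repositioned to just after the BOM.
*/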
string TokenizerClass::checkBOM( istream& in ){
string result = inputEncoding;
if ( &in == &cin ){
return result;
}
streampos pos = in.tellg();
string s;
in >> s;
UErrorCode err = U_ZERO_ERROR;
int32_t bomLength = 0;
const char *encoding = ucnv_detectUnicodeSignature( s.c_str(),
s.length(),
&bomLength,
&err);
if ( bomLength ){
if ( tokDebug ){
LOG << "Autodetected encoding: " << encoding << endl;
}
result = encoding;
if ( result == "UTF16BE"
|| result == "UTF-16BE" ){
result = "UTF16BE";
}
}
in.seekg( pos + (streampos)bomLength );
return result;
}
// string wrapper
void TokenizerClass::tokenizeLine( const string& s,
const string& lang ){
UnicodeString us = convert( s, inputEncoding );
tokenizeLine( us, lang );
}
// UnicodeString wrapper
void TokenizerClass::tokenizeLine( const UnicodeString& us,
const string& lang ){
bool bos = true;
tokenize_one_line( us, bos, lang );
if (tokDebug > 0) {
LOG << "[tokenizeLine()] before countSent " << endl;
}
countSentences(true); // force the ENDOFSENTENCE
}
bool u_isemo( UChar32 c ){
UBlockCode s = ublock_getCode(c);
return s == UBLOCK_EMOTICONS;
}
bool u_ispicto( UChar32 c ){
UBlockCode s = ublock_getCode(c);
return s == UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS ;
}
bool u_iscurrency( UChar32 c ){
return u_charType( c ) == U_CURRENCY_SYMBOL;
}
bool u_issymbol( UChar32 c ){
return u_charType( c ) == U_CURRENCY_SYMBOL
|| u_charType( c ) == U_MATH_SYMBOL
|| u_charType( c ) == U_MODIFIER_SYMBOL
|| u_charType( c ) == U_OTHER_SYMBOL;
}
const UnicodeString& detect_type( UChar32 c ){
if ( u_isspace(c)) {
return type_space;
}
else if ( u_iscurrency(c)) {
return type_currency;
}
else if ( u_ispunct(c)) {
return type_punctuation;
}
else if ( u_isemo( c ) ) {
return type_emoticon;
}
else if ( u_ispicto( c ) ) {
return type_picto;
}
else if ( u_isalpha(c)) {
return type_word;
}
else if ( u_isdigit(c)) {
return type_number;
}
else if ( u_issymbol(c)) {
return type_symbol;
}
else {
return type_unknown;
}
}
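// return a readable name for an ICU character category,
// as produced by u_charType()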
std::string toString( int8_t c ){
switch ( c ){
case 0:
return "U_UNASSIGNED";
case 1:
return "U_UPPERCASE_LETTER";
case 2:
return "U_LOWERCASE_LETTER";
case 3:
return "U_TITLECASE_LETTER";
case 4:
return "U_MODIFIER_LETTER";
case 5:
return "U_OTHER_LETTER";
case 6:
return "U_NON_SPACING_MARK";
case 7:
return "U_ENCLOSING_MARK";
case 8:
return "U_COMBINING_SPACING_MARK";
case 9:
return "U_DECIMAL_DIGIT_NUMBER";
case 10:
return "U_LETTER_NUMBER";
case 11:
return "U_OTHER_NUMBER";
case 12:
return "U_SPACE_SEPARATOR";
case 13:
return "U_LINE_SEPARATOR";
case 14:
return "U_PARAGRAPH_SEPARATOR";
case 15:
return "U_CONTROL_CHAR";
case 16:
return "U_FORMAT_CHAR";
case 17:
return "U_PRIVATE_USE_CHAR";
case 18:
return "U_SURROGATE";
case 19:
return "U_DASH_PUNCTUATION";
case 20:
return "U_START_PUNCTUATION";
case 21:
return "U_END_PUNCTUATION";
case 22:
return "U_CONNECTOR_PUNCTUATION";
case 23:
return "U_OTHER_PUNCTUATION";
case 24:
return "U_MATH_SYMBOL";
case 25:
return "U_CURRENCY_SYMBOL";
case 26:
return "U_MODIFIER_SYMBOL";
case 27:
return "U_OTHER_SYMBOL";
case 28:
return "U_INITIAL_PUNCTUATION";
case 29:
return "U_FINAL_PUNCTUATION";
default:
return "OMG NO CLUE WHAT KIND OF SYMBOL THIS IS: "
+ TiCC::toString( int(c) );
}
}
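/*!
\param originput the line to tokenize
\param _lang the language to use; when empty or unknown, the default
language is used
\return the number of NEW tokens appended to the token buffer
*/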
int TokenizerClass::internal_tokenize_line( const UnicodeString& originput,
const string& _lang ){
string lang = _lang;
if ( lang.empty() ){
lang = "default";
}
else {
auto const it = settings.find( lang );
if ( it == settings.end() ){
LOG << "tokenizeLine: no settings found for language=" + lang << endl
<< "using the default language instead:" << default_language << endl;
lang = "default";
}
}
if (tokDebug){
LOG << "[tokenizeLine] input: line=["
<< originput << "] (language= " << lang << ")" << endl;
}
UnicodeString input = normalizer.normalize( originput );
if ( doFilter ){
input = settings[lang]->filter.filter( input );
}
if ( input.isBogus() ){ //only tokenize valid input
LOG << "ERROR: Invalid UTF-8 in line:" << linenum << endl
<< " '" << input << "'" << endl;
return 0;
}
int32_t len = input.countChar32();
if (tokDebug){
LOG << "[tokenizeLine] filtered input: line=["
<< input << "] (" << len
<< " unicode characters)" << endl;
}
const int begintokencount = tokens.size();
if (tokDebug) {
LOG << "[tokenizeLine] Tokens still in buffer: " << begintokencount << endl;
}
bool tokenizeword = false;
bool reset = false;
//iterate over all characters
UnicodeString word;
StringCharacterIterator sit(input);
long int i = 0;
long int tok_size = 0;
while ( sit.hasNext() ){
UChar32 c = sit.current32();
bool joiner = false;
if ( c == u'\u200D' ){
joiner = true;
}
if ( tokDebug > 8 ){
UnicodeString s = c;
int8_t charT = u_charType( c );
LOG << "examine character: " << s << " type= "
<< toString( charT ) << endl;
}
if (reset) { //reset values for new word
reset = false;
tok_size = 0;
if ( !joiner && !u_isspace(c) ){
word = c;
}
else {
word = "";
}
tokenizeword = false;
}
else if ( !joiner && !u_isspace(c) ){
word += c;
}
if ( joiner && sit.hasNext() ){
UChar32 peek = sit.next32();
if ( u_isspace(peek) ){
joiner = false;
}
sit.previous32();
}
if ( u_isspace(c) || joiner || i == len-1 ){
if (tokDebug){
LOG << "[tokenizeLine] space detected, word=[" << word << "]" << endl;
}
if ( i == len-1 ) {
if ( joiner
|| u_ispunct(c)
|| u_isdigit(c)
|| u_isquote( c, settings[lang]->quotes )
|| u_isemo(c) ){
tokenizeword = true;
}
}
if ( c == '\n' && word.isEmpty() ){
if (tokDebug){
LOG << "[tokenizeLine] NEW PARAGRAPH upcoming " << endl;
}
// signal that the next word starts a new Paragraph (if it's there)
paragraphsignal_next = true;
}
int expliciteosfound = -1;
if ( word.length() >= eosmark.length() ) {
expliciteosfound = word.lastIndexOf(eosmark);
if (expliciteosfound != -1) { // word contains eosmark
if ( tokDebug >= 2){
LOG << "[tokenizeLine] Found explicit EOS marker @"< 0) {
UnicodeString realword;
word.extract(0,expliciteosfound,realword);
if (tokDebug >= 2) {
LOG << "[tokenizeLine] Prefix before EOS: "
<< realword << endl;
}
tokenizeWord( realword, false, lang );
eospos++;
}
if ( expliciteosfound + eosmark.length() < word.length() ){
UnicodeString realword;
word.extract( expliciteosfound+eosmark.length(),
word.length() - expliciteosfound - eosmark.length(),
realword );
if (tokDebug >= 2){
LOG << "[tokenizeLine] postfix after EOS: "
<< realword << endl;
}
tokenizeWord( realword, true, lang );
}
if ( !tokens.empty() && eospos >= 0 ) {
if (tokDebug >= 2){
LOG << "[tokenizeLine] Assigned EOS" << endl;
}
tokens[eospos].role |= ENDOFSENTENCE;
}
}
}
if ( word.length() > 0
&& expliciteosfound == -1 ) {
if (tokDebug >= 2){
LOG << "[tokenizeLine] Further tokenization necessary for: ["
<< word << "]" << endl;
}
if ( tokenizeword ) {
tokenizeWord( word, !joiner, lang );
}
else {
tokenizeWord( word, !joiner, lang, type_word );
}
}
//reset values for new word
reset = true;
}
else if ( u_ispunct(c)
|| u_isdigit(c)
|| u_isquote( c, settings[lang]->quotes )
|| u_isemo(c) ){
if (tokDebug){
LOG << "[tokenizeLine] punctuation or digit detected, word=["
<< word << "]" << endl;
}
//there is punctuation or digits in this word, mark to run through tokenizer
tokenizeword = true;
}
sit.next32();
++i;
++tok_size;
if ( tok_size > 2500 ){
LOG << "Ridiculously long word/token (over 2500 characters) detected "
<< "in line: " << linenum << ". Skipped ..." << endl;
LOG << "The line starts with " << UnicodeString( word, 0, 75 )
<< "..." << endl;
return 0;
}
}
int numNewTokens = tokens.size() - begintokencount;
if (tokDebug >= 10){
LOG << "tokens.size() = " << tokens.size() << endl;
LOG << "begintokencount = " << begintokencount << endl;
LOG << "numnew = " << numNewTokens << endl;
}
if ( numNewTokens > 0 ){
if (paragraphsignal) {
tokens[begintokencount].role |= NEWPARAGRAPH | BEGINOFSENTENCE;
paragraphsignal = false;
}
//find sentence boundaries
if (sentenceperlineinput) {
// force it to be a sentence
tokens[begintokencount].role |= BEGINOFSENTENCE;
tokens.back().role |= ENDOFSENTENCE;
}
detectSentenceBounds( begintokencount );
}
return numNewTokens;
}
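/*!
\param input the word to tokenize
\param space true when the word was followed by a space in the input
\param lang the language whose rules are applied
\param assigned_type when non-empty, the type already assigned by a
matching rule; this signals a recursive call
Applies the rules of \e lang to the word, recursing into the
pre-context, the matches and the post-context of the first matching
rule.
*/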
void TokenizerClass::tokenizeWord( const UnicodeString& input,
bool space,
const string& lang,
const UnicodeString& assigned_type ) {
bool recurse = !assigned_type.isEmpty();
int32_t inpLen = input.countChar32();
if ( tokDebug > 2 ){
if ( recurse ){
LOG << " [tokenizeWord] Recurse Input: (" << inpLen << ") "
<< "word=[" << input << "], type=" << assigned_type
<< " Space=" << (space?"TRUE":"FALSE") << endl;
}
else {
LOG << " [tokenizeWord] Input: (" << inpLen << ") "
<< "word=[" << input << "]"
<< " Space=" << (space?"TRUE":"FALSE") << endl; }
}
if ( input == eosmark ) {
if (tokDebug >= 2){
LOG << " [tokenizeWord] Found explicit EOS marker" << endl;
}
if (!tokens.empty()) {
if (tokDebug >= 2){
LOG << " [tokenizeWord] Assigned EOS" << endl;
}
tokens.back().role |= ENDOFSENTENCE;
}
else {
LOG << "[WARNING] Found explicit EOS marker by itself, this will have no effect!" << endl;
}
return;
}
if ( inpLen == 1) {
//single character, no need to process all rules, do some simpler (faster) detection
UChar32 c = input.char32At(0);
UnicodeString type = detect_type( c );
if ( type == type_space ){
return;
}
if ( doPunctFilter
&& ( type == type_punctuation || type == type_currency ||
type == type_emoticon || type == type_picto ) ) {
if (tokDebug >= 2 ){
LOG << " [tokenizeWord] skipped PUNCTUATION ["
<< input << "]" << endl;
}
if ( !tokens.empty() ){
tokens.back().role &= ~NOSPACE;
}
}
else {
UnicodeString word = input;
if ( norm_set.find( type ) != norm_set.end() ){
word = "{{" + type + "}}";
}
TokenRole role = (space ? NOROLE : NOSPACE);
if ( paragraphsignal_next ){
role |= NEWPARAGRAPH;
paragraphsignal_next = false;
}
Token T( type, word, role, lang );
tokens.push_back( T );
if (tokDebug >= 2){
LOG << " [tokenizeWord] added token " << T << endl;
}
}
}
else {
bool a_rule_matched = false;
for ( const auto& rule : settings[lang]->rules ) {
if ( tokDebug >= 4){
LOG << "\tTESTING " << rule->id << endl;
}
UnicodeString type = rule->id;
//Find first matching rule
UnicodeString pre, post;
vector<UnicodeString> matches;
if ( rule->matchAll( input, pre, post, matches ) ){
a_rule_matched = true;
if ( tokDebug >= 4 ){
LOG << "\tMATCH: " << type << endl;
LOG << "\tpre= '" << pre << "'" << endl;
LOG << "\tpost= '" << post << "'" << endl;
int cnt = 0;
for ( const auto& m : matches ){
LOG << "\tmatch[" << ++cnt << "]=" << m << endl;
}
}
if ( recurse
&& ( type == type_word
|| ( pre.isEmpty()
&& post.isEmpty() ) ) ){
// only do this recurse step when:
// - we have a WORD, or
// - we have an exact match of the rule (no pre or post)
if ( assigned_type != type_word ){
// don't change the type when:
// it was already non-WORD
if ( tokDebug >= 4 ){
LOG << "\trecurse, match didn't do anything new for " << input << endl;
}
TokenRole role = (space ? NOROLE : NOSPACE);
if ( paragraphsignal_next ){
role |= NEWPARAGRAPH;
paragraphsignal_next = false;
}
tokens.push_back( Token( assigned_type, input, role, lang ) );
return;
}
else {
if ( tokDebug >= 4 ){
LOG << "\trecurse, match changes the type:"
<< assigned_type << " to " << type << endl;
}
TokenRole role = (space ? NOROLE : NOSPACE);
if ( paragraphsignal_next ){
role |= NEWPARAGRAPH;
paragraphsignal_next = false;
}
tokens.push_back( Token( type, input, role, lang ) );
return;
}
}
if ( pre.length() > 0 ){
if ( tokDebug >= 4 ){
LOG << "\tTOKEN pre-context (" << pre.length()
<< "): [" << pre << "]" << endl;
}
tokenizeWord( pre, false, lang ); //pre-context, no space after
}
if ( matches.size() > 0 ){
int max = matches.size();
if ( tokDebug >= 4 ){
LOG << "\tTOKEN match #=" << matches.size() << endl;
}
for ( int m=0; m < max; ++m ){
if ( tokDebug >= 4 ){
LOG << "\tTOKEN match[" << m << "] = " << matches[m]
<< " Space=" << (space?"TRUE":"FALSE") << endl;
}
if ( doPunctFilter
&& rule->id.startsWith("PUNCTUATION") ){
if (tokDebug >= 2 ){
LOG << " [tokenizeWord] skipped PUNCTUATION ["
<< matches[m] << "]" << endl;
}
if ( !tokens.empty() ){
tokens.back().role &= ~NOSPACE;
}
}
else {
bool internal_space = space;
if ( post.length() > 0 ) {
internal_space = false;
}
else if ( m < max-1 ){
internal_space = false;
}
UnicodeString word = matches[m];
if ( norm_set.find( type ) != norm_set.end() ){
word = "{{" + type + "}}";
TokenRole role = (internal_space ? NOROLE : NOSPACE);
if ( paragraphsignal_next ){
role |= NEWPARAGRAPH;
paragraphsignal_next = false;
}
tokens.push_back( Token( type, word, role, lang ) );
}
else {
if ( recurse ){
TokenRole role = (internal_space ? NOROLE : NOSPACE);
if ( paragraphsignal_next ){
role |= NEWPARAGRAPH;
paragraphsignal_next = false;
}
tokens.push_back( Token( type, word, role, lang ) );
}
else {
tokenizeWord( word, internal_space, lang, type );
}
}
}
}
}
else if ( tokDebug >=4 ){
// should never come here?
LOG << "\tPANIC there's no match" << endl;
}
if ( post.length() > 0 ){
if ( tokDebug >= 4 ){
LOG << "\tTOKEN post-context (" << post.length()
<< "): [" << post << "]" << endl;
}
tokenizeWord( post, space, lang );
}
break;
}
}
if ( !a_rule_matched ){
// no rule matched
if ( tokDebug >=4 ){
LOG << "\tthere's no match at all" << endl;
}
TokenRole role = (space ? NOROLE : NOSPACE);
if ( paragraphsignal_next ){
role |= NEWPARAGRAPH;
paragraphsignal_next = false;
}
tokens.push_back( Token( assigned_type, input, role, lang ) );
}
}
}
string TokenizerClass::get_data_version() const {
return UCTODATA_VERSION;
}
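/*!
\param fname the name of the settings file to read
\param tname the name of an additional file, passed on to
Setting::read() (may be empty)
\return true when the settings could be read and installed as the
"default" language
*/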
bool TokenizerClass::init( const string& fname, const string& tname ){
if ( tokDebug ){
LOG << "Initiating tokenizer..." << endl;
}
data_version = get_data_version();
Setting *set = new Setting();
if ( !set->read( fname, tname, tokDebug, theErrLog ) ){
LOG << "Cannot read Tokenizer settingsfile " << fname << endl;
LOG << "Unsupported language? (Did you install the uctodata package?)"
<< endl;
return false;
}
else {
settings["default"] = set;
default_language = "default";
auto pos = fname.find("tokconfig-");
if ( pos != string::npos ){
default_language = fname.substr(pos+10);
settings[default_language] = set;
}
else if ( xmlout ){
LOG << " unable to determine a language. cannot proceed" << endl;
return false;
}
}
if ( tokDebug ){
LOG << "effective rules: " << endl;
for ( size_t i=0; i < set->rules.size(); ++i ){
LOG << "rule " << i << " " << *(set->rules[i]) << endl;
}
LOG << "EOS markers: " << set->eosmarkers << endl;
LOG << "Quotations: " << set->quotes << endl;
try {
LOG << "Filter: " << set->filter << endl;
}
catch (...){
}
}
return true;
}
bool TokenizerClass::init( const vector<string>& languages,
const string& tname ){
if ( tokDebug > 0 ){
LOG << "Initiating tokenizer from language list..." << endl;
}
data_version = get_data_version();
Setting *default_set = 0;
for ( const auto& lang : languages ){
if ( tokDebug > 0 ){
LOG << "init language=" << lang << endl;
}
string fname = "tokconfig-" + lang;
Setting *set = new Setting();
string add;
if ( default_set == 0 ){
add = tname;
}
if ( !set->read( fname, add, tokDebug, theErrLog ) ){
LOG << "problem reading datafile for language: " << lang << endl;
LOG << "Unsupported language (Did you install the uctodata package?)"
<< endl;
}
else {
if ( default_set == 0 ){
default_set = set;
settings["default"] = set;
default_language = lang;
}
settings[lang] = set;
}
}
if ( settings.empty() ){
cerr << "ucto: No useful settingsfile(s) could be found (initiating from language list: " << languages << ")" << endl;
return false;
}
return true;
}
string get_language( const vector<Token>& tv ){
// examine the assigned languages of ALL tokens.
// they should all be the same
// assign that value
string result = "default";
for ( const auto& t : tv ){
if ( !t.lang_code.empty() && t.lang_code != "default" ){
if ( result == "default" ){
result = t.lang_code;
}
if ( result != t.lang_code ){
throw logic_error( "ucto: conflicting language(s) assigned" );
}
}
}
return result;
}
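/*!
\param language the language to look up
\param set_file output parameter: the settings file for \e language
\param version output parameter: the version of those settings
\return true when settings for \e language are present
*/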
bool TokenizerClass::get_setting_info( const std::string& language,
std::string& set_file,
std::string& version ) const {
set_file.clear();
version.clear();
auto const& it = settings.find( language );
if ( it == settings.end() ){
return false;
}
else {
set_file = it->second->set_file;
version = it->second->version;
return true;
}
}
} //namespace Tokenizer