/* ============================================================================ * Douglas Thrift's Search Engine License * * Copyright (C) 2002-2004, 2008, Douglas Thrift. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: * * "This product includes software developed by Douglas Thrift * (http://computers.douglasthrift.net/searchengine/)." * * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not * be used to endorse or promote products derived from this software without * specific prior written permission. For written permission, please visit * http://www.douglasthrift.net/contact.cgi for contact information. * * 5. Products derived from this software may not be called "Douglas Thrift's * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their * name, without prior written permission. * * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ============================================================================ */ // Douglas Thrift's Search Engine Ranker // // Douglas Thrift // // $Id: Ranker.cpp 372 2008-08-23 11:00:12Z douglas $ #include "Ranker.hpp" void Ranker::rank(vector query) { vector prep; for (size_t index(0); index < query.size(); index++) { if (query[index] == "allintitle:" && index == 0) { allIn = title; } else if (query[index] == "allinurl:" && index == 0) { allIn = url; } else if (query[index] == "allintext:" && index == 0) { allIn = text; } else if (query[index].find("site:") == 0 && query[index].size() > 5) { site = query[index].substr(5); } else if (query[index].find("intitle:") == 0 && query[index].size() > 8) { prep.push_back("TITLE " + query[index].substr(8)); } else if (query[index].find("inurl:") == 0 && query[index].size() > 6) { prep.push_back("URL " + query[index].substr(6)); } else if (query[index].find("intext:") == 0 && query[index].size() > 7) { prep.push_back("TEXT " + query[index].substr(7)); } else { prep.push_back(query[index]); } } if (prep.size() > 0) { bool or_(false); for (size_t index(0); index < prep.size(); index++) { bool exclude(false); if (prep[index].find('+') == 0) { prep[index].erase(0, 1); } else if (prep[index].find('-') == 0) { exclude = true; prep[index].erase(0, 1); } if (or_) { if (prep[index].find(" OR") == string::npos) { or_ = false; } eitherOr[eitherOr.size() - 1] += ' ' + prep[index]; } else if (exclude) { excluded.push_back(prep[index]); } else if (prep[index].find(" OR") != string::npos) { or_ = true; eitherOr.push_back(prep[index]); } else { required.push_back(prep[index]); } } } rank(); } void Ranker::setSample() { map::iterator itor; multimap::iterator> distances; for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++) { size_t distance; if (++itor != occurrencesText.end()) { size_t next(itor->first); itor--; distance = next - (itor->first + itor->second); } else { distance = string::npos; itor--; } distances.insert(pair::iterator>(distance, itor)); } if (distances.begin() != distances.end()) { itor = distances.begin()->second; } string portion; size_t sampleLength(0), begin(0), end(string::npos); while (sampleLength < sampleMax && itor != occurrencesText.end()) { size_t found(itor->first), length(itor->second); for (size_t index(found); index > begin; index--) { if (found - index >= sampleMax - sampleLength - length) { while (index < found) { if (isspace(getText()[index++])) break; } begin = index; break; } else if ((index > begin ? (isupper(getText()[index]) && !isalnum(getText()[index - 1])) : isupper(getText()[index])) && index != found) { begin = index; break; } } if (end + 1 != begin) sample += " ... "; portion = getText().substr(begin, found - begin); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion + ""; portion = getText().substr(found, length); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion + ""; begin = found + length; end = begin - 1; if (++itor != occurrencesText.end()) { if (itor->first + itor->second < begin + sampleMax - sampleLength) { portion = getText().substr(begin, itor->first - begin); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion; begin = itor->first; end = begin - 1; } else { for (end = begin + sampleMax - sampleLength; end > begin; end--) { if (isspace(getText()[end])) break; } portion = getText().substr(begin, end - begin + 1); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion + " ..."; break; } } else { for (end = begin + sampleMax - sampleLength; end > begin && (end + 1 < getText().length()); end--) { if (isspace(getText()[end])) break; } if (end >= getText().length()) end = getText().length() - 1; portion = getText().substr(begin, end - begin + 1); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion; if (end + 1 < getText().length()) { sample += " ..."; } break; } } if (sample.empty()) { for (end = sampleMax; end > 0 && (end + 1 < getText().length()); end--) { if (isspace(getText()[end])) break; } sample = getText().substr(0, end + 1); entities(sample, '&', "&"); entities(sample, '\"', """); entities(sample, '<', "<"); entities(sample, '>', ">"); if (end + 1 < getText().length()) { sample += " ..."; } else if (sample.empty()) { sample = "..."; } } } string Ranker::getTitle() { string title, portion; size_t begin(0); for (map::iterator itor = occurrencesTitle.begin(); itor != occurrencesTitle.end(); itor++) { size_t found(itor->first), length(itor->second); portion = Page::getTitle().substr(begin, found - begin); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); title += portion + ""; portion = Page::getTitle().substr(found, length); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); title += portion + ""; begin = found + length; } portion = Page::getTitle().substr(begin); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); title += portion; return title; } string Ranker::getDescription() { string description, portion; size_t begin(0); for (map::iterator itor = occurrencesDescription.begin(); itor != occurrencesDescription.end(); itor++) { size_t found(itor->first), length(itor->second); portion = Page::getDescription().substr(begin, found - begin); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); description += portion + ""; portion = Page::getDescription().substr(found, length); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); description += portion + ""; begin = found + length; } portion = Page::getDescription().substr(begin); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); description += portion; return description; } bool Ranker::operator==(const size_t number) const { return value == number; } bool Ranker::operator==(const Ranker& ranker) const { return value == ranker.value; } bool Ranker::operator!=(const size_t number) const { return value != number; } bool Ranker::operator!=(const Ranker& ranker) const { return value != ranker.value; } bool Ranker::operator<(const size_t number) const { return value < number; } bool Ranker::operator<(const Ranker& ranker) const { return value < ranker.value; } bool Ranker::operator>(const size_t number) const { return value > number; } bool Ranker::operator >(const Ranker& ranker) const { return value > ranker.value; } void Ranker::rank() { lowerAddress = tolower(getAddress()); if (site.empty() || lowerAddress.rfind(site) == lowerAddress.length() - site.length()) { bool isRequired(required.size() > 0), isExcluded(excluded.size() > 0), isEitherOr(eitherOr.size() > 0); lowerURL = tolower(getURL()); lowerTitle = tolower(Page::getTitle()); lowerText = tolower(Page::getText()); if (isRequired) checkRequired(); if (isExcluded && (isRequired || isEitherOr)) checkExcluded(); if (isEitherOr) checkEitherOr(); if (isRequired && isExcluded && isEitherOr) { value += requiredValue && !excludedValue && eitherOrValue ? requiredValue + eitherOrValue : 0; } else if (isRequired && isExcluded) { value += requiredValue && !excludedValue ? requiredValue : 0; } else if (isRequired && isEitherOr) { value += requiredValue && eitherOrValue ? requiredValue + eitherOrValue : 0; } else if (isExcluded && isEitherOr) { value += !excludedValue && eitherOrValue ? eitherOrValue : 0; } else if (isRequired) { value += requiredValue; } else if (isEitherOr) { value += eitherOrValue; } else { // do nothing this is a bad search and warrants no results } if (value > 0) { string lowerDescription(tolower(Page::getDescription())); for (size_t index(0); index < required.size(); index++) { if (required[index].find("URL ") == 0) { value += find(required[index].substr(4), lowerDescription, occurrencesDescription); } else if (required[index].find("TITLE ") == 0) { value += find(required[index].substr(6), lowerDescription, occurrencesDescription); } else if (required[index].find("TEXT ") == 0) { value += find(required[index].substr(5), lowerDescription, occurrencesDescription); } else { value += find(required[index], lowerDescription, occurrencesDescription); } } for (size_t index1(0); index1 < eitherOr.size(); index1++) { vector words; size_t begin(0), found; do { found = eitherOr[index1].find(" OR ", begin); if (found != string::npos) { words.push_back(eitherOr[index1].substr(begin, found - begin)); } else { words.push_back(eitherOr[index1].substr(begin)); } begin = found + 4; } while (begin < eitherOr[index1].length() && found != string::npos); for (size_t number(0); number < words.size(); number++) { if (words[index1].find("URL ") == 0) { value += find(words[index1].substr(4), lowerDescription, occurrencesDescription); } else if (words[index1].find("TITLE ") == 0) { value += find(words[index1].substr(6), lowerDescription, occurrencesDescription); } else if (words[index1].find("TEXT ") == 0) { value += find(words[index1].substr(5), lowerDescription, occurrencesDescription); } else { value += find(words[index1], lowerDescription, occurrencesDescription); } } } for (size_t index2(0); index2 < getHeadings().size(); index2++) { string lowerHeading = string(getHeadings()[index2].length(), ' '); for (size_t number(0); number < getHeadings()[index2].length(); number++) { lowerHeading[number] = tolower( getHeadings()[index2][number]); } for (size_t number0(0); number0 < required.size(); number0++) { if (required[number0].find("URL ") == 0) { value += find(required[number0].substr(4), lowerHeading); } else if (required[number0].find("TITLE ") == 0) { value += find(required[number0].substr(6), lowerHeading); } else if (required[number0].find("TEXT ") == 0) { value += find(required[number0].substr(5), lowerHeading); } else { value += find(required[number0], lowerHeading); } } for (size_t number1(0); number1 < eitherOr.size(); number1++) { vector words; size_t begin(0), found; do { found = eitherOr[number1].find(" OR ", begin); if (found != string::npos) { words.push_back(eitherOr[number1].substr(begin, found - begin)); } else { words.push_back(eitherOr[number1].substr(begin)); } begin = found + 4; } while (begin < eitherOr[number1].length() && found != string::npos); for (size_t number(0); number < words.size(); number++) { if (words[number].find("URL ") == 0) { value += find(words[number].substr(4), lowerHeading); } else if (words[number].find("TITLE ") == 0) { value += find(words[number].substr(6), lowerHeading); } else if (words[number].find("TEXT ") == 0) { value += find(words[number].substr(5), lowerHeading); } else { value += find(words[number], lowerHeading); } } } } } } } void Ranker::checkRequired() { vector inURLs, inTitles, inTexts; for (size_t index(0); index < required.size(); index++) { size_t inURL(0), inTitle(0), inText(0); if (required[index].find("URL ") == 0) { inURL = find(required[index].substr(4), lowerURL.substr(7)); if (inURL) { inTitle = find(required[index].substr(4), lowerTitle, occurrencesTitle); inText = find(required[index].substr(4), lowerText, occurrencesText); if (!inTitle) inTitle++; if (!inText) inText++; } } else if (required[index].find("TITLE ") == 0) { inTitle = find(required[index].substr(6), lowerTitle, occurrencesTitle); if (inTitle) { inURL = find(required[index].substr(6), lowerURL.substr(7)); inText = find(required[index].substr(6), lowerText, occurrencesText); if (!inURL) inURL++; if (!inText) inText++; } } else if (required[index].find("TEXT ") == 0) { inText = find(required[index].substr(5), lowerText, occurrencesText); if (inText) { inURL = find(required[index].substr(5), lowerURL.substr(7)); inTitle = find(required[index].substr(5), lowerTitle, occurrencesTitle); if (!inURL) inURL++; if (!inTitle) inTitle++; } } else { inURL = find(required[index], lowerURL.substr(7)); inTitle = find(required[index], lowerTitle, occurrencesTitle); inText = find(required[index], lowerText, occurrencesText); } inURLs.push_back(inURL); inTitles.push_back(inTitle); inTexts.push_back(inText); } size_t inURL(evaluate(inURLs)), inTitle(evaluate(inTitles)), inText(evaluate(inTexts)); requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + inText : 0; } void Ranker::checkExcluded() { vector inURLs, inTitles, inTexts; for (size_t index(0); index < excluded.size(); index++) { size_t inURL(0), inTitle(0), inText(0); inURL = find(excluded[index], lowerURL.substr(7)); inTitle = find(excluded[index], lowerTitle); inText = find(excluded[index], lowerText); inURLs.push_back(inURL); inTitles.push_back(inTitle); inTexts.push_back(inText); } size_t inURL(evaluate(inURLs)), inTitle = evaluate(inTitles), inText(evaluate(inTexts)); excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + inText : 0; } void Ranker::checkEitherOr() { vector inURLs, inTitles, inTexts; for (size_t index(0); index < eitherOr.size(); index++) { vector inURLz, inTitlez, inTextz; size_t inURL(0), inTitle(0), inText(0); vector words; size_t begin(0), found; do { found = eitherOr[index].find(" OR ", begin); if (found != string::npos) { words.push_back(eitherOr[index].substr(begin, found - begin)); } else { words.push_back(eitherOr[index].substr(begin)); } begin = found + 4; } while (begin < eitherOr[index].length() && found != string::npos); for (size_t number(0); number < words.size(); number++) { size_t inURL(0), inTitle(0), inText(0); if (words[number].find("URL ") == 0) { inURL = find(words[number].substr(4), lowerURL.substr(7)); if (inURL) { inTitle = find(words[number].substr(4), lowerTitle, occurrencesTitle); inText = find(words[number].substr(4), lowerText, occurrencesText); if (!inTitle) inTitle++; if (!inText) inText++; } } else if (words[number].find("TITLE ") == 0) { inTitle = find(words[number].substr(6), lowerTitle, occurrencesTitle); if (inTitle) { inURL = find(words[number].substr(6), lowerURL.substr(7)); inText = find(words[number].substr(6), lowerText, occurrencesText); if (!inURL) inURL++; if (!inText) inText++; } } else if (words[number].find("TEXT ") == 0) { inText = find(words[number].substr(5), lowerText, occurrencesText); if (inText) { inURL = find(words[number].substr(5), lowerURL.substr(7)); inTitle = find(words[number].substr(5), lowerTitle, occurrencesTitle); if (!inURL) inURL++; if (!inTitle) inTitle++; } } else { inURL = find(words[number], lowerURL.substr(7)); inTitle = find(words[number], lowerTitle, occurrencesTitle); inText = find(words[number], lowerText, occurrencesText); } inURLz.push_back(inURL); inTitlez.push_back(inTitle); inTextz.push_back(inText); } for (size_t number0(0); number0 < inURLz.size(); number0++) { inURL += inURLz[number0]; } for (size_t number1(0); number1 < inTitlez.size(); number1++) { inTitle += inTitlez[number1]; } for (size_t number2(0); number2 < inTextz.size(); number2++) { inText += inTextz[number2]; } inURLs.push_back(inURL); inTitles.push_back(inTitle); inTexts.push_back(inText); inURLz.clear(); inTitlez.clear(); inTextz.clear(); words.clear(); } size_t inURL(evaluate(inURLs)), inTitle = evaluate(inTitles), inText(evaluate(inTexts)); eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + inText : 0; } size_t Ranker::find(string word, const string& where) { size_t value(0); decrap(word); if (word.empty()) { // this can happen if a word is all crap characters value++; } else if (word.find_first_of(" \n\t") == string::npos) { size_t begin(0), found; do { found = where.find(word, begin); if (found != string::npos) { bool isBefore, isAfter, before(false), after(false); isBefore = found - 1 > 0; isAfter = found + word.length() < where.length(); if (isBefore) before = isalnum(where[found - 1]) != 0; if (isAfter) after = isalnum(where[found + word.length()]) != 0; if (!before && !after) { value++; } } begin = found + word.length(); } while (found != string::npos && begin < where.length()); } else { value = phrase(word, where); } return value; } size_t Ranker::find(string word, const string& where, map& occurrences) { size_t value(0); decrap(word); if (word.empty()) { // this can happen if a word is all crap characters value++; } else if (word.find_first_of(" \n ") == string::npos) { size_t begin(0), found; do { found = where.find(word, begin); if (found != string::npos) { bool isBefore, isAfter, before(false), after(false); isBefore = found - 1 > 0; isAfter = found + word.length() < where.length(); if (isBefore) before = isalnum(where[found - 1]) != 0; if (isAfter) after = isalnum(where[found + word.length()]) != 0; if (!before && !after) { value++; occurrences.insert(pair(found, word.length())); } } begin = found + word.length(); } while (found != string::npos && begin < where.length()); } else { value = phrase(word, where, occurrences); } return value; } size_t Ranker::phrase(const string& phrase, const string& where) { size_t value(0); vector words; size_t begin(0), space; do { space = phrase.find(' ', begin); words.push_back(phrase.substr(begin, space - begin)); begin = space + 1; } while (space != string::npos && begin < phrase.length()); begin = 0; size_t counter(0); do { value += this->phrase(words, 0, begin, true, where); } while (begin < where.length()); return value; } size_t Ranker::phrase(const string& phrase, const string& where, map& occurrences) { size_t value(0); vector words; size_t begin(0), space; do { space = phrase.find(' ', begin); words.push_back(phrase.substr(begin, space - begin)); begin = space + 1; } while (space != string::npos && begin < phrase.length()); begin = 0; do { value += this->phrase(words, 0, begin, true, where, occurrences); } while (begin < where.length()); return value; } size_t Ranker::phrase(const vector& words, size_t word, size_t& begin, bool start, const string& where) { size_t value(0); bool end(!(word + 1 < words.size())); size_t found(where.find(words[word], begin)), newBegin(found + words[word].length()); if (found != string::npos) { bool isBefore, isAfter, before(false), after(false); isBefore = found - 1 > 0; isAfter = found + words[word].length() < where.length(); if (isBefore) before = isalnum(where[found - 1]) != 0; if (isAfter) after = isalnum(where[found + words[word].length()]) != 0; if (!before && !after) { bool between(true); if (!start) { for (size_t index = begin + 1; index < found - 1; index++) { if (isalnum(where[index])) { between = false; break; } } } if (between) { if (end) { begin = newBegin; value = 1; } else { value = phrase(words, (word + 1), newBegin, false, where); } } } } if (start) { if (found != string::npos) { begin = newBegin; } else { begin = string::npos; } } return value; } size_t Ranker::phrase(const vector& words, size_t word, size_t& begin, bool start, const string& where, map& occurrences) { size_t value(0); bool end(!(word + 1 < words.size())); size_t found(where.find(words[word], begin)), newBegin(found + words[word].length()); if (found != string::npos) { bool isBefore, isAfter, before(false), after(false); isBefore = found - 1 > 0; isAfter = found + words[word].length() < where.length(); if (isBefore) before = isalnum(where[found - 1]) != 0; if (isAfter) after = isalnum(where[found + words[word].length()]) != 0; if (!before && !after) { bool between(true); if (!start) { for (size_t index = begin + 1; index < found - 1; index++) { if (isalnum(where[index])) { between = false; break; } } } if (between) { occurrences.insert(pair(found, words[word].length())); if (end) { begin = newBegin; value = 1; } else { value = phrase(words, (word + 1), newBegin, false, where, occurrences); } } } } if (start) { if (found != string::npos) { begin = newBegin; } else { begin = string::npos; } } return value; } size_t Ranker::evaluate(vector& ins) { size_t in(0); for (size_t index(0); index < ins.size(); index++) { if (ins[index] > 0) { in += ins[index]; } else { in = 0; break; } } return in; } void Ranker::decrap(string& crap) { size_t begin(0), found; do { // &, _, +, and # are not considered crap found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin); if (found != string::npos) { crap[found] = ' '; } begin = found + 1; } while (found != string::npos && begin < crap.length()); normalize(crap); }