reader.cpp - OpenGrok cross reference for /dports/devel/kiwix-lib/kiwix-lib-9.4.1/src/reader.cpp

/*
 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU  General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */

#include "reader.h"
#include <time.h>

#include <zim/search.h>

#include "tools/otherTools.h"

/* Return the lowercase hexadecimal digit for the upper four bits of v. */
inline char hi(char v)
{
  static const char digits[] = "0123456789abcdef";
  const int upperNibble = (v >> 4) & 0xf;
  return digits[upperNibble];
}

/* Return the lowercase hexadecimal digit for the lower four bits of v. */
inline char lo(char v)
{
  static const char digits[] = "0123456789abcdef";
  const int lowerNibble = v & 0xf;
  return digits[lowerNibble];
}

/*
 * Format the first 16 bytes of `in` as a lowercase, dash-separated UUID
 * string (groups of 8-4-4-4-12 hexadecimal digits).
 *
 * An input shorter than 16 bytes is padded with zero bytes, so the missing
 * bytes are rendered as "00" instead of being read past the end of the
 * string. Bytes after the 16th are ignored.
 */
std::string hexUUID(std::string in)
{
  static const char digits[] = "0123456789abcdef";
  const std::string::size_type uuidSize = 16;

  /* Pad short inputs so that every index used below is valid. */
  if (in.size() < uuidSize) {
    in.resize(uuidSize, '\0');
  }

  std::string out;
  out.reserve(2 * uuidSize + 4);
  for (std::string::size_type n = 0; n < uuidSize; ++n) {
    /* A dash precedes the groups starting at bytes 4, 6, 8 and 10. */
    if (n == 4 || n == 6 || n == 8 || n == 10) {
      out += '-';
    }
    const unsigned char byte = static_cast<unsigned char>(in[n]);
    out += digits[byte >> 4];
    out += digits[byte & 0xf];
  }
  return out;
}

namespace kiwix
{
/*
 * Constructor: open the ZIM file and cache the 'A' namespace offsets and the
 * 'A' and 'I' namespace counts.
 *
 * A path ending in "zimaa" (and longer than five characters) loses its last
 * two characters before being handed to zim::File. The path stored in
 * this->zimFilePath is the one given by the caller, unmodified.
 *
 * Exceptions raised while opening or querying the file propagate to the
 * caller.
 */
Reader::Reader(const string zimFilePath) : zimFileHandler(NULL)
{
  string tmpZimFilePath = zimFilePath;

  /* Remove potential trailing zimaa */
  size_t found = tmpZimFilePath.rfind("zimaa");
  if (found != string::npos && tmpZimFilePath.size() > 5
      && found == tmpZimFilePath.size() - 5) {
    tmpZimFilePath.resize(tmpZimFilePath.size() - 2);
  }

  /* A plain new-expression throws on failure, it never yields NULL. */
  this->zimFileHandler = new zim::File(tmpZimFilePath);

  /* The destructor does not run when a constructor throws, so the handler
   * has to be released here if reading the namespace information fails. */
  try {
    this->firstArticleOffset
        = this->zimFileHandler->getNamespaceBeginOffset('A');
    this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
    this->nsACount = this->zimFileHandler->getNamespaceCount('A');
    this->nsICount = this->zimFileHandler->getNamespaceCount('I');
    this->zimFilePath = zimFilePath;
  } catch (...) {
    delete this->zimFileHandler;
    this->zimFileHandler = NULL;
    throw;
  }

  /* initialize random seed: */
  srand(time(NULL));
}

/* Destructor: release the zim::File owned by this reader. */
Reader::~Reader()
{
  /* Deleting a null pointer is a no-op, so no guard is required. */
  delete this->zimFileHandler;
}

/* Expose the underlying zim::File pointer; the Reader deletes it in its
 * destructor, so callers must not free it. */
zim::File* Reader::getZimFileHandler() const
{
  zim::File* const handler = this->zimFileHandler;
  return handler;
}

/* Parse the "Counter" entry of the 'M' namespace with parseMimetypeCounter().
 * Returns a default-constructed MimeCounterType when that entry is not
 * available. */
MimeCounterType Reader::parseCounterMetadata() const
{
  zim::Article counterArticle = this->zimFileHandler->getArticle('M', "Counter");

  if (!counterArticle.good()) {
    return MimeCounterType();
  }

  return parseMimetypeCounter(counterArticle.getData());
}

/* Get the count of articles which can be indexed/displayed.
 *
 * When counter metadata is present, sums the counters of every mimetype
 * starting with "text/html"; otherwise falls back to the size of the 'A'
 * namespace. */
unsigned int Reader::getArticleCount() const
{
  std::map<const std::string, unsigned int> mimeCounts
      = this->parseCounterMetadata();

  if (mimeCounts.empty()) {
    return this->nsACount;
  }

  unsigned int total = 0;
  for (auto it = mimeCounts.begin(); it != mimeCounts.end(); ++it) {
    if (startsWith(it->first, "text/html")) {
      total += it->second;
    }
  }
  return total;
}

/* Get the count of media contents in the ZIM file.
 *
 * When counter metadata is present, sums the "image/jpeg", "image/gif" and
 * "image/png" counters; otherwise falls back to the size of the 'I'
 * namespace. */
unsigned int Reader::getMediaCount() const
{
  static const char* const imageMimeTypes[]
      = {"image/jpeg", "image/gif", "image/png"};

  std::map<const std::string, unsigned int> mimeCounts
      = this->parseCounterMetadata();

  if (mimeCounts.empty()) {
    return this->nsICount;
  }

  unsigned int total = 0;
  for (const char* mimeType : imageMimeTypes) {
    auto match = mimeCounts.find(mimeType);
    if (match != mimeCounts.end()) {
      total += match->second;
    }
  }
  return total;
}

/* Get the total number of items of the ZIM file as reported by
 * getCountArticles(), redirects included. */
unsigned int Reader::getGlobalCount() const
{
  const unsigned int total = this->zimFileHandler->getCountArticles();
  return total;
}

/* Return the UUID found in the ZIM file header, rendered through its stream
 * output operator. */
string Reader::getId() const
{
  std::ostringstream uuidText;
  uuidText << this->zimFileHandler->getFileheader().getUuid();
  return uuidText.str();
}

/* Return a page url from a title */
bool Reader::getPageUrlFromTitle(const string& title, string& url) const
{
  try {
    auto entry = getEntryFromTitle(title);
    entry = entry.getFinalEntry();
    url = entry.getPath();
    return true;
  } catch (NoEntry& e) {
    return false;
  }
}

/* Return the path of a randomly picked page (see getRandomPage()). */
string Reader::getRandomPageUrl() const
{
  Entry randomPage = getRandomPage();
  return randomPage.getPath();
}

/*
 * Pick a random article of the 'A' namespace, avoiding the main page.
 *
 * Up to 42 candidates are drawn. The first one that is valid and is not the
 * main page is returned. If every draw was either invalid or the main page,
 * the last candidate is returned when it is valid (e.g. an archive whose only
 * article is the main page); otherwise NoEntry is thrown.
 *
 * Throws NoEntry when no ZIM file is open. Exceptions from getMainPage()
 * propagate.
 */
Entry Reader::getRandomPage() const
{
  if (!this->zimFileHandler) {
    throw NoEntry();
  }

  const std::string mainPagePath = this->getMainPage().getPath();
  zim::Article article;

  for (int attempts = 42; attempts > 0; --attempts) {
    /* Scale rand() into [0, nsACount) and offset it into the 'A' namespace. */
    const auto idx = this->firstArticleOffset
        + static_cast<zim::size_type>(
              static_cast<double>(rand())
              / (static_cast<double>(RAND_MAX) + 1) * this->nsACount);
    article = zimFileHandler->getArticle(idx);

    /* good() is checked first so that getLongUrl() is never called on an
     * invalid article. */
    if (article.good() && article.getLongUrl() != mainPagePath) {
      return article;
    }
  }

  /* Only the main page could be drawn: better than no page at all. */
  if (article.good()) {
    return article;
  }

  throw NoEntry();
}

/* Return the path of the welcome page (see getMainPage()). */
string Reader::getMainPageUrl() const
{
  Entry mainPage = getMainPage();
  return mainPage.getPath();
}

/* Return the main page declared in the ZIM file header. Falls back to
 * getFirstPage() when the header declares none or the declared article is
 * not valid. Throws NoEntry when no ZIM file is open. */
Entry Reader::getMainPage() const
{
  if (!this->zimFileHandler) {
    throw NoEntry();
  }

  zim::Article mainArticle;
  const auto& header = this->zimFileHandler->getFileheader();
  if (header.hasMainPage()) {
    mainArticle = zimFileHandler->getArticle(header.getMainPage());
  }

  if (mainArticle.good()) {
    return mainArticle;
  }

  return getFirstPage();
}

/* Fetch the favicon of the ZIM file.
 *
 * The known favicon paths are tried in order; the first one that resolves
 * (after following it to its final entry) provides content and mimeType.
 * Returns false when none of the paths yields an entry. */
bool Reader::getFavicon(string& content, string& mimeType) const
{
  static const char* const candidates[] = {"-/favicon", "-/favicon.png", "I/favicon.png", "I/favicon"};
  const size_t candidateCount = sizeof(candidates) / sizeof(candidates[0]);

  for (size_t i = 0; i < candidateCount; ++i) {
    try {
      Entry favicon = getEntryFromPath(candidates[i]).getFinalEntry();
      content = favicon.getContent();
      mimeType = favicon.getMimetype();
      return true;
    } catch (const NoEntry&) {
      /* Not stored under this path: try the next candidate. */
    }
  }

  return false;
}

/* Return the path this reader was constructed with. */
string Reader::getZimFilePath() const
{
  return zimFilePath;
}
/* Return a metatag value */
bool Reader::getMetadata(const string& name, string& value) const
{
  try {
    auto entry = getEntryFromPath("M/"+name);
    value = entry.getContent();
    return true;
  } catch(NoEntry& e) {
    return false;
  }
}

/* Expands to a complete function body: declares a local string, fills it
 * through getMetadata(NAME, ...) and returns it. The result of getMetadata()
 * is ignored, so the string stays empty when the lookup reports failure. */
#define METADATA(NAME) std::string v; getMetadata(NAME, v); return v;

/* Return the "Name" metadata, or an empty string when getMetadata() fails. */
string Reader::getName() const
{
  std::string name;
  getMetadata("Name", name);
  return name;
}

/* Return the "Title" metadata.
 *
 * When it is empty, a title is derived from the last path element of the ZIM
 * file name: underscores become spaces and everything from the first ".zim"
 * onwards is dropped. */
string Reader::getTitle() const
{
  string title;
  this->getMetadata("Title", title);
  if (!title.empty()) {
    return title;
  }

  title = getLastPathElement(zimFileHandler->getFilename());
  std::replace(title.begin(), title.end(), '_', ' ');
  /* find() returning npos makes substr() keep the whole string. */
  const size_t extensionPos = title.find(".zim");
  return title.substr(0, extensionPos);
}

/* Return the "Creator" metadata, or an empty string when getMetadata()
 * fails. */
string Reader::getCreator() const
{
  std::string creator;
  getMetadata("Creator", creator);
  return creator;
}

/* Return the "Publisher" metadata, or an empty string when getMetadata()
 * fails. */
string Reader::getPublisher() const
{
  std::string publisher;
  getMetadata("Publisher", publisher);
  return publisher;
}

/* Return the "Date" metadata, or an empty string when getMetadata() fails. */
string Reader::getDate() const
{
  std::string date;
  getMetadata("Date", date);
  return date;
}

/* Return the "Description" metadata, falling back to "Subtitle" when the
 * description is empty. */
string Reader::getDescription() const
{
  string description;
  this->getMetadata("Description", description);

  if (!description.empty()) {
    return description;
  }

  /* Mediawiki Collection tends to use the "Subtitle" name */
  this->getMetadata("Subtitle", description);
  return description;
}

/* Return the "LongDescription" metadata, or an empty string when
 * getMetadata() fails. */
string Reader::getLongDescription() const
{
  std::string longDescription;
  getMetadata("LongDescription", longDescription);
  return longDescription;
}

/* Return the "Language" metadata, or an empty string when getMetadata()
 * fails. */
string Reader::getLanguage() const
{
  std::string language;
  getMetadata("Language", language);
  return language;
}

/* Return the "License" metadata, or an empty string when getMetadata()
 * fails. */
string Reader::getLicense() const
{
  std::string license;
  getMetadata("License", license);
  return license;
}

/* Return the "Tags" metadata.
 *
 * With original set, the stored string is returned as is; otherwise it is
 * passed through convertTags() and re-joined with ';'. */
string Reader::getTags(bool original) const
{
  string rawTags;
  getMetadata("Tags", rawTags);

  if (!original) {
    auto convertedTags = convertTags(rawTags);
    return join(convertedTags, ";");
  }

  return rawTags;
}


/* Return the value of tag tagName, looked up with getTagValueFromTagList()
 * in the converted "Tags" metadata. */
string Reader::getTagStr(const std::string& tagName) const
{
  string rawTags;
  getMetadata("Tags", rawTags);

  const auto tagList = convertTags(rawTags);
  return getTagValueFromTagList(tagList, tagName);
}

bool Reader::getTagBool(const std::string& tagName) const
{
  return convertStrToBool(getTagStr(tagName));
}

/* Return the "Relation" metadata, or an empty string when getMetadata()
 * fails. */
string Reader::getRelation() const
{
  std::string relation;
  getMetadata("Relation", relation);
  return relation;
}

/* Return the "Flavour" metadata, or an empty string when getMetadata()
 * fails. */
string Reader::getFlavour() const
{
  std::string flavour;
  getMetadata("Flavour", flavour);
  return flavour;
}

/* Return the "Source" metadata, or an empty string when getMetadata()
 * fails. */
string Reader::getSource() const
{
  std::string source;
  getMetadata("Source", source);
  return source;
}

/* Return the "Scraper" metadata, or an empty string when getMetadata()
 * fails. */
string Reader::getScraper() const
{
  std::string scraper;
  getMetadata("Scraper", scraper);
  return scraper;
}
/* Keep the helper macro from leaking into the rest of the file. */
#undef METADATA

/*
 * Return the original file UUID stored in the "startfileuid" metadata,
 * formatted by hexUUID(), or an empty string when that metadata is empty.
 *
 * The metadata is read as a list of decimal numbers, each terminated by a
 * newline; every number becomes one byte of the UUID. At most 16 numbers are
 * used, missing ones count as zero, and text after the last newline is
 * ignored.
 */
string Reader::getOrigId() const
{
  string value;
  this->getMetadata("startfileuid", value);
  if (value.empty()) {
    return "";
  }

  const size_t uuidSize = 16;
  char bytes[uuidSize] = {0};
  size_t byteCount = 0;
  std::string number;

  for (std::string::size_type i = 0; i < value.size(); ++i) {
    if (value[i] != '\n') {
      number += value[i];
      continue;
    }
    /* Stop once the UUID is complete instead of writing past the buffer. */
    if (byteCount == uuidSize) {
      break;
    }
    bytes[byteCount++] = static_cast<char>(atoi(number.c_str()));
    number.clear();
  }

  /* Pass the length explicitly: the bytes may legitimately contain zeros,
   * which would cut a C-string conversion short. */
  return hexUUID(std::string(bytes, uuidSize));
}

/* Return the path of the first page (see getFirstPage()). */
string Reader::getFirstPageUrl() const
{
  Entry firstPage = getFirstPage();
  return firstPage.getPath();
}

/* Return the article located at the beginning of the 'A' namespace.
 * Throws NoEntry when no ZIM file is open or that article is not valid. */
Entry Reader::getFirstPage() const
{
  if (!this->zimFileHandler) {
    throw NoEntry();
  }

  auto firstArticle = zimFileHandler->getArticle(
      zimFileHandler->getNamespaceBeginOffset('A'));

  if (firstArticle.good()) {
    return firstArticle;
  }

  throw NoEntry();
}

/* Split a url of the form "[/]N/title" into its namespace character N and
 * its title.
 *
 * Returns true and fills both *ns and title on success. Returns false when
 * the url does not have that shape; in that case title is left untouched,
 * while *ns has already been written if a namespace character was read
 * before the mismatch was detected. */
bool _parseUrl(const string& url, char* ns, string& title)
{
  const unsigned int length = url.size();

  /* Skip the optional leading '/' */
  unsigned int pos = (url[0] == '/') ? 1 : 0;

  /* A namespace character is required here, not another '/' */
  if (pos >= length || url[pos] == '/') {
    return false;
  }
  *ns = url[pos];
  ++pos;

  /* The namespace must be followed by a '/' separator... */
  if (pos >= length || url[pos] != '/') {
    return false;
  }
  ++pos;

  /* ...and by a non-empty title */
  if (pos >= length) {
    return false;
  }

  title = url.substr(pos, length - pos);
  return true;
}

/* Member-function front end of _parseUrl(): same arguments, same result. */
bool Reader::parseUrl(const string& url, char* ns, string& title) const
{
  const bool parsed = _parseUrl(url, ns, title);
  return parsed;
}

/* Return the entry stored at path ("[/]N/url").
 *
 * When _parseUrl() extracts neither a namespace nor a url from path, the
 * main page is returned. Throws NoEntry when no ZIM file is open or the
 * article found is not valid. */
Entry Reader::getEntryFromPath(const std::string& path) const
{
  if (!this->zimFileHandler) {
    throw NoEntry();
  }

  char ns = 0;
  std::string shortUrl;
  /* The result is deliberately ignored: a failed parse is recognised below
   * by ns and shortUrl keeping their initial values. */
  _parseUrl(path, &ns, shortUrl);

  const bool nothingParsed = shortUrl.empty() && ns == 0;
  if (nothingParsed) {
    return getMainPage();
  }

  auto article = zimFileHandler->getArticle(ns, shortUrl);
  if (article.good()) {
    return article;
  }

  throw NoEntry();
}

/* Same as getEntryFromPath(), for a path that is first passed through
 * urlDecode(path, true). */
Entry Reader::getEntryFromEncodedPath(const std::string& path) const
{
  const std::string decodedPath = urlDecode(path, true);
  return getEntryFromPath(decodedPath);
}

/* Return the article of the 'A' namespace found by title.
 * Throws NoEntry when no ZIM file is open or the article is not valid. */
Entry Reader::getEntryFromTitle(const std::string& title) const
{
  if (!this->zimFileHandler) {
    throw NoEntry();
  }

  auto titledArticle = this->zimFileHandler->getArticleByTitle('A', title);
  if (titledArticle.good()) {
    return titledArticle;
  }

  throw NoEntry();
}

/* Return article by url */
bool Reader::getArticleObjectByDecodedUrl(const string& url,
                                          zim::Article& article) const
{
  if (this->zimFileHandler == NULL) {
    return false;
  }

  /* Parse the url */
  char ns = 0;
  string urlStr;
  _parseUrl(url, &ns, urlStr);

  /* Main page */
  if (urlStr.empty() && ns == 0) {
    _parseUrl(this->getMainPage().getPath(), &ns, urlStr);
  }

  /* Extract the content from the zim file */
  article = zimFileHandler->getArticle(ns, urlStr);
  return article.good();
}

/* Return the mimeType without the content */
bool Reader::getMimeTypeByUrl(const string& url, string& mimeType) const
{
  try {
    auto entry = getEntryFromPath(url);
    mimeType = entry.getMimetype();
    return true;
  } catch (NoEntry& e) {
    mimeType = "";
    return false;
  }
}

bool get_content_by_decoded_url(const Reader& reader,
                                const string& url,
                                string& content,
                                string& title,
                                unsigned int& contentLength,
                                string& contentType,
                                string& baseUrl)
{
  content = "";
  contentType = "";
  contentLength = 0;

  try {
    auto entry = reader.getEntryFromPath(url);
    entry = entry.getFinalEntry();
    baseUrl = entry.getPath();
    contentType = entry.getMimetype();
    content = entry.getContent();
    contentLength = entry.getSize();
    title = entry.getTitle();

    /* Try to set a stub HTML header/footer if necesssary */
    if (contentType.find("text/html") != string::npos
      && content.find("<body") == std::string::npos
      && content.find("<BODY") == std::string::npos) {
      content = "<html><head><title>" + title +
              "</title><meta http-equiv=\"Content-Type\" content=\"text/html; "
              "charset=utf-8\" /></head><body>" +
              content + "</body></html>";
    }
    return true;
  } catch (NoEntry& e) {
    return false;
  }
}


/* Get a content from the ZIM file. The url is passed through
 * kiwix::urlDecode() first; the base url of the entry is discarded. */
bool Reader::getContentByUrl(const string& url,
                             string& content,
                             string& title,
                             unsigned int& contentLength,
                             string& contentType) const
{
  std::string discardedBaseUrl;
  const std::string decodedUrl = kiwix::urlDecode(url);
  return get_content_by_decoded_url(
      *this, decodedUrl, content, title, contentLength, contentType,
      discardedBaseUrl);
}

/* Get a content from the ZIM file by encoded url. The url is passed through
 * kiwix::urlDecode() first; baseUrl receives the path of the final entry. */
bool Reader::getContentByEncodedUrl(const string& url,
                                    string& content,
                                    string& title,
                                    unsigned int& contentLength,
                                    string& contentType,
                                    string& baseUrl) const
{
  const std::string decodedUrl = kiwix::urlDecode(url);
  return get_content_by_decoded_url(
      *this, decodedUrl, content, title, contentLength, contentType, baseUrl);
}

/* Get a content from the ZIM file by encoded url. The url is passed through
 * kiwix::urlDecode() first; the base url of the entry is discarded. */
bool Reader::getContentByEncodedUrl(const string& url,
                                    string& content,
                                    string& title,
                                    unsigned int& contentLength,
                                    string& contentType) const
{
  std::string discardedBaseUrl;
  const std::string decodedUrl = kiwix::urlDecode(url);
  return get_content_by_decoded_url(
      *this, decodedUrl, content, title, contentLength, contentType,
      discardedBaseUrl);
}

/* Get a content from the ZIM file by an already decoded url; the base url
 * of the entry is discarded. */
bool Reader::getContentByDecodedUrl(const string& url,
                                    string& content,
                                    string& title,
                                    unsigned int& contentLength,
                                    string& contentType) const
{
  std::string discardedBaseUrl;
  return get_content_by_decoded_url(
      *this, url, content, title, contentLength, contentType,
      discardedBaseUrl);
}

/* Get a content from the ZIM file by an already decoded url; baseUrl
 * receives the path of the final entry. */
bool Reader::getContentByDecodedUrl(const string& url,
                                    string& content,
                                    string& title,
                                    unsigned int& contentLength,
                                    string& contentType,
                                    string& baseUrl) const
{
  const bool loaded = get_content_by_decoded_url(
      *this, url, content, title, contentLength, contentType, baseUrl);
  return loaded;
}

/* Check if an article exists; plain forward to pathExists(). */
bool Reader::urlExists(const string& url) const
{
  const bool exists = pathExists(url);
  return exists;
}

bool Reader::pathExists(const string& path) const
{
  if (!zimFileHandler)
  {
    return false;
  }

  char ns = 0;
  string titleStr;
  _parseUrl(path, &ns, titleStr);
  zim::File::const_iterator findItr = zimFileHandler->find(ns, titleStr);
  return findItr != zimFileHandler->end() && findItr->getUrl() == titleStr;
}

/* Does the ZIM file have a fulltext index?
 *
 * Always false when no ZIM file is open or the file is multi-part;
 * otherwise true when one of the two known index paths exists. */
bool Reader::hasFulltextIndex() const
{
  if (!zimFileHandler || zimFileHandler->is_multiPart()) {
    return false;
  }

  if (pathExists("Z//fulltextIndex/xapian")) {
    return true;
  }
  return pathExists("X/fulltext/xapian");
}

/* Search titles by prefix and store the matches in the reader's own
 * suggestion list.
 *
 * With reset set, the list is emptied first. Without it, the search is
 * skipped (returning false) when the list already holds more than
 * suggestionsCount entries. After a search the suggestion cursor points to
 * the beginning of the list. */
bool Reader::searchSuggestions(const string& prefix,
                               unsigned int suggestionsCount,
                               const bool reset)
{
  if (!reset && this->suggestions.size() > suggestionsCount) {
    return false;
  }

  if (reset) {
    this->suggestions.clear();
    /* clear() invalidated the cursor: re-seat it right away */
    this->suggestionsOffset = this->suggestions.begin();
  }

  const bool found
      = searchSuggestions(prefix, suggestionsCount, this->suggestions);

  /* Rewind the cursor to the beginning */
  this->suggestionsOffset = this->suggestions.begin();

  return found;
}


bool Reader::searchSuggestions(const string& prefix,
                               unsigned int suggestionsCount,
                               SuggestionsList_t& results)
{
  bool retVal = false;

  /* Return if no prefix */
  if (prefix.size() == 0) {
    return false;
  }

  for (auto articleItr = zimFileHandler->findByTitle('A', prefix);
       articleItr != zimFileHandler->end()
       && articleItr->getTitle().compare(0, prefix.size(), prefix) == 0
       && results.size() < suggestionsCount;
       ++articleItr) {
    /* Extract the interesting part of article title & url */
    std::string normalizedArticleTitle
        = kiwix::normalize(articleItr->getTitle());
    std::string articleFinalUrl = "/A/" + articleItr->getUrl();
    if (articleItr->isRedirect()) {
      zim::Article article = *articleItr;
      unsigned int loopCounter = 0;
      while (article.isRedirect() && loopCounter++ < 42) {
        article = article.getRedirectArticle();
      }
      articleFinalUrl = "/A/" + article.getUrl();
    }

    /* Go through all already found suggestions and skip if this
       article is already in the suggestions list (with an other
       title) */
    bool insert = true;
    std::vector<std::vector<std::string>>::iterator suggestionItr;
    for (suggestionItr = results.begin();
         suggestionItr != results.end();
         suggestionItr++) {
      int result = normalizedArticleTitle.compare((*suggestionItr)[2]);
      if (result == 0 && articleFinalUrl.compare((*suggestionItr)[1]) == 0) {
        insert = false;
        break;
      } else if (result < 0) {
        break;
      }
    }

    /* Insert if possible */
    if (insert) {
      std::vector<std::string> suggestion;
      suggestion.push_back(articleItr->getTitle());
      suggestion.push_back(articleFinalUrl);
      suggestion.push_back(normalizedArticleTitle);
      results.insert(suggestionItr, suggestion);
    }

    /* Suggestions where found */
    retVal = true;
  }

  return retVal;
}

std::vector<std::string> Reader::getTitleVariants(
    const std::string& title) const
{
  std::vector<std::string> variants;
  variants.push_back(title);
  variants.push_back(kiwix::ucFirst(title));
  variants.push_back(kiwix::lcFirst(title));
  variants.push_back(kiwix::toTitle(title));
  return variants;
}


/* Run the "smart" suggestion search into the reader's own suggestion list,
 * which is emptied first. The suggestion cursor ends up at the beginning of
 * the list. */
bool Reader::searchSuggestionsSmart(const string& prefix,
                                    unsigned int suggestionsCount)
{
  this->suggestions.clear();
  /* clear() invalidated the cursor: re-seat it right away */
  this->suggestionsOffset = this->suggestions.begin();

  const bool found
      = searchSuggestionsSmart(prefix, suggestionsCount, this->suggestions);

  this->suggestionsOffset = this->suggestions.begin();
  return found;
}

/* Search suggestions for prefix into results.
 *
 * The suggestion search of the ZIM file is tried first: when it estimates at
 * least one match, every valid hit is appended as a {title, "/A/" + url,
 * normalized title} triple and true is returned. Otherwise the prefix search
 * is run for each variant from getTitleVariants() and true is returned if
 * any of them found something. */
bool Reader::searchSuggestionsSmart(const string& prefix,
                                    unsigned int suggestionsCount,
                                    SuggestionsList_t& results)
{
  std::vector<std::string> variants = this->getTitleVariants(prefix);

  /* Try to search in the title using fulltext search database */
  const auto suggestionSearch
      = this->getZimFileHandler()->suggestions(prefix, 0, suggestionsCount);

  if (!suggestionSearch->get_matches_estimated()) {
    /* No hit: try a few variations of the prefix to have better results */
    bool anyFound = false;
    for (const std::string& variant : variants) {
      anyFound = this->searchSuggestions(variant, suggestionsCount, results)
                 || anyFound;
    }
    return anyFound;
  }

  for (auto hit = suggestionSearch->begin();
       hit != suggestionSearch->end();
       hit++) {
    if (hit->good()) {
      std::vector<std::string> suggestion;
      suggestion.push_back(hit->getTitle());
      suggestion.push_back("/A/" + hit->getUrl());
      suggestion.push_back(kiwix::normalize(hit->getTitle()));
      results.push_back(suggestion);
    }
  }

  return true;
}

/* Get the next suggestion: store its title and advance the cursor.
 * Returns false, leaving title untouched, once the list is exhausted. */
bool Reader::getNextSuggestion(string& title)
{
  if (this->suggestionsOffset == this->suggestions.end()) {
    return false;
  }

  /* Element 0 of a suggestion is its title */
  title = (*this->suggestionsOffset)[0];

  /* Move on for the next call */
  ++this->suggestionsOffset;
  return true;
}

/* Get the next suggestion: store its title and url and advance the cursor.
 * Returns false, leaving both outputs untouched, once the list is
 * exhausted. */
bool Reader::getNextSuggestion(string& title, string& url)
{
  if (this->suggestionsOffset == this->suggestions.end()) {
    return false;
  }

  /* Elements 0 and 1 of a suggestion are its title and its url */
  title = (*this->suggestionsOffset)[0];
  url = (*this->suggestionsOffset)[1];

  /* Move on for the next call */
  ++this->suggestionsOffset;
  return true;
}

/* Check if the file has a checksum, i.e. getChecksum() is not empty. */
bool Reader::canCheckIntegrity() const
{
  const bool hasChecksum = this->zimFileHandler->getChecksum() != "";
  return hasChecksum;
}

/* Return true if corrupted, false otherwise.
 *
 * The file counts as corrupted when verify() does not return true or throws;
 * in the latter case the exception message is written to cerr. */
bool Reader::isCorrupted() const
{
  bool verified = false;

  try {
    verified = (this->zimFileHandler->verify() == true);
  } catch (exception& e) {
    cerr << e.what() << endl;
  }

  return !verified;
}

/* Return the file size, works also for splitted files */
unsigned int Reader::getFileSize() const
{
  zim::File* file = this->getZimFileHandler();
  zim::size_type size = 0;

  if (file != NULL) {
    size = file->getFilesize();
  }

  return (size / 1024);
}
}