1 /************************************************************************
2 **
3 **  Copyright (C) 2016 Kevin B. Hendricks Stratford, ON, Canada
4 **
5 **  This file is part of Sigil.
6 **
7 **  Sigil is free software: you can redistribute it and/or modify
8 **  it under the terms of the GNU General Public License as published by
9 **  the Free Software Foundation, either version 3 of the License, or
10 **  (at your option) any later version.
11 **
12 **  Sigil is distributed in the hope that it will be useful,
13 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 **  GNU General Public License for more details.
16 **
17 **  You should have received a copy of the GNU General Public License
18 **  along with Sigil.  If not, see <http://www.gnu.org/licenses/>.
19 **
20 *************************************************************************/
21 
22 #include <QMutex>
23 #include <QString>
24 #include <QStringList>
25 
26 #include "BookManipulation/HTMLMetadata.h"
27 
// Event names; the three values are the ones HtmlToOpfDC() stores in "opf:event".
// (Not referenced directly by the code visible in this file.)
static const QStringList EVENT_LIST           = QStringList() << "creation" << "publication" << "modification";
// Lower-case element names / refinements that HtmlToOpfDC() treats as a
// date carrying the "modification", "creation" or "publication" event.
static const QStringList MODIFICATION_ALIASES = QStringList() << "modified" << "modification";
static const QStringList CREATION_ALIASES     = QStringList() << "created"  << "creation";
static const QStringList PUBLICATION_ALIASES  = QStringList() << "issued"   << "published" << "publication";
// Canonical spellings of identifier schemes; HtmlToOpfDC() matches against
// these case-insensitively and substitutes the spelling listed here.
static const QStringList SCHEME_LIST          = QStringList() << "ISBN" << "ISSN" << "DOI";

// Mutex locked by Instance() while it checks for / creates the singleton.
QMutex HTMLMetadata::s_AccessMutex;
// Singleton pointer returned by Instance(); stays NULL until the first call.
HTMLMetadata *HTMLMetadata::m_Instance = NULL;
36 
Instance()37 HTMLMetadata *HTMLMetadata::Instance()
38 {
39     // We use a static local variable
40     // to hold our singleton instance; using a pointer member
41     // variable creates problems with object destruction;
42     QMutexLocker locker(&s_AccessMutex);
43 
44     if (m_Instance == 0) {
45         m_Instance = new HTMLMetadata();
46     }
47 
48     return m_Instance;
49 }
50 
51 
52 
53 // Processes metadata from inside xhtml files for the gui
54 // try to extract whatever dc metadata possible
MapHTMLToOPFMetadata(GumboNode * node,GumboInterface & gi)55 MetaEntry HTMLMetadata::MapHTMLToOPFMetadata(GumboNode* node, GumboInterface & gi)
56 {
57     MetaEntry meta;
58     if (node->v.element.tag == GUMBO_TAG_META) {
59         QString name;
60         QString value;
61         QHash<QString,QString> matts;
62 
63         GumboAttribute* attr = gumbo_get_attribute(&node->v.element.attributes, "name");
64         if (attr) name = QString::fromUtf8(attr->value);
65 
66         attr = gumbo_get_attribute(&node->v.element.attributes, "content");
67         if (attr) value = QString::fromUtf8(attr->value);
68 
69         attr = gumbo_get_attribute(&node->v.element.attributes, "scheme");
70         if (attr) matts[QString("scheme")] = QString::fromUtf8(attr->value);
71 
72         attr = gumbo_get_attribute(&node->v.element.attributes, "opf:scheme");
73         if (attr) matts[QString("opf:scheme")] = QString::fromUtf8(attr->value);
74 
75         attr = gumbo_get_attribute(&node->v.element.attributes, "id");
76         if (attr) matts[QString("id")] = QString::fromUtf8(attr->value);
77 
78         if ((!name.isEmpty()) && (!value.isEmpty())) {
79             meta = FixupHTMLMetadata(name, value, matts);
80         }
81     }
82     return meta;
83 }
84 
85 
86 // Maps Dublic Core metadata to internal book meta format
FixupHTMLMetadata(QString name,QString value,const QHash<QString,QString> & matts)87 MetaEntry HTMLMetadata::FixupHTMLMetadata(QString name, QString value, const QHash<QString,QString> & matts)
88 {
89     QString lowname = name.toLower();
90     MetaEntry me;
91 
92     if (!lowname.startsWith("dc.") &&
93         !lowname.startsWith("dcterms.")) {
94         if (lowname == "copyright") {
95             me.m_name = "dc:rights";
96             me.m_content = value;
97         } else if (lowname == "author") {
98             me.m_name = "dc:creator";
99             me.m_content = value;
100         } else if (lowname == "publisher") {
101             me.m_name = "dc:publisher";
102             me.m_content = value;
103         } else if (lowname == "source") {
104             me.m_name = "dc:source";
105             me.m_content = value;
106         } else if (lowname == "description") {
107             me.m_name = "dc:description";
108             me.m_content = value;
109         } else if (lowname == "date" || lowname == "published") {
110             me.m_name = "dc:date";
111             me.m_content = value;
112         } else if (lowname  == "eisbn") {
113             me.m_name = "dc:identifier";
114             me.m_content = "urn:isbn:" + value;
115         } else if (lowname  == "issn") {
116             me.m_name = "dc:identifier";
117             me.m_content = "urn:issn:" + value;
118         } else if (lowname == "doi") {
119             me.m_name = "dc:identifier";
120             me.m_content = "urn:doi:" + value;
121         }
122         return me;
123     }
124     return HtmlToOpfDC(name, value, matts);
125 }
126 
127 
128 
129 // Converts HTML sourced Dublin Core metadata to OPF style metadata
130 //
131 // Sample of HTML Based DC Metadata:
132 //   <meta name="DC.Title" content="The Title"/>
133 //   <meta name="DC.Language" content="en"/>
134 //   <meta name="DC.Creator" content=""/>
135 //   <meta name="DC.Publisher" content="Publisher Name"/>
136 //   <meta name="DC.Date" content="2016-03-01"/>
137 //   <meta name="DC.Identifier" content="978-0-00000-000-0" scheme="ISBN"/>
138 //   <meta name="DC.Relation" content="978-0-00000-000-0" scheme="ISBN"/>
139 
HtmlToOpfDC(QString mname,QString mvalue,const QHash<QString,QString> & matts)140 MetaEntry HTMLMetadata::HtmlToOpfDC(QString mname, QString mvalue, const QHash<QString,QString> & matts)
141 {
142     // Dublin Core from html file with the original 15 element namespace or
143     // expanded DCTerms namespace. Allows qualifiers as refinements
144     // prefix.name[.refinement]
145     QStringList fields = QString(mname.toLower() + "..").split(".");
146     QString name       = fields[ 1 ];
147     QString refinement = fields[ 2 ];
148     QString dc_event;
149 
150     if (MODIFICATION_ALIASES.contains(name) || MODIFICATION_ALIASES.contains(refinement)) {
151         name = "dc:date";
152         dc_event = "modification";
153     } else if (CREATION_ALIASES.contains(name) || CREATION_ALIASES.contains(refinement)) {
154         name     = "dc:date";
155         dc_event = "creation";
156     } else if (PUBLICATION_ALIASES.contains(name) || PUBLICATION_ALIASES.contains(refinement)) {
157         name     = "dc:date";
158         dc_event = "publication";
159     }
160 
161     QString role   = (name == "creator") || (name == "contributor") ? refinement : QString();
162 
163     QString scheme = matts.value("scheme");
164     if ((name == "identifier") && (scheme.isEmpty())) {
165         scheme = refinement;
166     }
167     if (!scheme.isEmpty()) {
168         if (SCHEME_LIST.contains(scheme, Qt::CaseInsensitive)) {
169             scheme = SCHEME_LIST.filter(scheme, Qt::CaseInsensitive)[ 0 ];
170         }
171     }
172 
173     MetaEntry me;
174     me.m_name  = "dc:" + name;
175     me.m_content = mvalue;
176     if (!scheme.isEmpty()) {
177         me.m_atts[ "opf:scheme" ] = scheme;
178     }
179     if (!dc_event.isEmpty()) {
180         me.m_atts[ "opf:event" ] = dc_event;
181     }
182     if (!role.isEmpty()) {
183         me.m_atts[ "opf:role" ] = role;
184     }
185     return me;
186 }
187