1;;; sb-nytimes.el --- shimbun backend for The New York Times
2
3;; Copyright (C) 2007-2010, 2019 Katsumi Yamaoka
4
5;; Author: Katsumi Yamaoka <yamaoka@jpl.org>
6;; Keywords: news
7
8;; This file is a part of shimbun.
9
10;; This program is free software; you can redistribute it and/or modify
11;; it under the terms of the GNU General Public License as published by
12;; the Free Software Foundation; either version 2, or (at your option)
13;; any later version.
14
15;; This program is distributed in the hope that it will be useful,
16;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18;; GNU General Public License for more details.
19
20;; You should have received a copy of the GNU General Public License
21;; along with this program; see the file COPYING.  If not, write to
22;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23;; Boston, MA 02110-1301, USA.
24
25;;; Commentary:
26
27;;; Code:
28
29(require 'shimbun)
30(require 'sb-rss)
31(require 'sb-multi)
32
;; Backend class for The New York Times.  It inherits from
;; `shimbun-newspaper', `shimbun-multi' and `shimbun-rss' and declares
;; no slots of its own (the trailing empty list).
(luna-define-class shimbun-nytimes (shimbun-newspaper
				    shimbun-multi shimbun-rss) ())

(defvar shimbun-nytimes-url "http://www.nytimes.com/"
  "URL of the NYTimes top page, the parent of all the group URLs.
Not referenced in this file; presumably picked up by the shimbun
framework through its variable naming convention -- TODO confirm.")

(defvar shimbun-nytimes-server-name "The New York Times"
  "Human-readable name of this shimbun server.
Not referenced in this file; presumably picked up by the shimbun
framework through its variable naming convention -- TODO confirm.")
40
(defvar shimbun-nytimes-group-table
  '(("homepage" "NYTIMES.COM HOMEPAGE"
     "http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml")

    ("news.business" "BUSINESS"
     "http://www.nytimes.com/services/xml/rss/nyt/Business.xml")
    ("news.business.media&advertising" "Media & Advertising"
     "http://www.nytimes.com/services/xml/rss/nyt/MediaandAdvertising.xml")
    ("news.business.worldbusiness" "World Business"
     "http://www.nytimes.com/services/xml/rss/nyt/WorldBusiness.xml")
    ("news.business.smallbusiness" "Small Business"
     "http://www.nytimes.com/services/xml/rss/nyt/SmallBusiness.xml")
    ("news.business.yourmoney" "Your Money"
     "http://www.nytimes.com/services/xml/rss/nyt/YourMoney.xml")
    ("news.business.dealbook" "DealBook"
     "http://dealbook.blogs.nytimes.com/rss2.xml")

    ("news.education" "EDUCATION"
     "http://www.nytimes.com/services/xml/rss/nyt/Education.xml")

    ("news.health" "HEALTH"
     "http://www.nytimes.com/services/xml/rss/nyt/Health.xml")
    ("news.health.policy" "Health Policy"
     "http://www.nytimes.com/services/xml/rss/nyt/HealthCarePolicy.xml")
    ("news.health.psychology" "Mental Health & Behavior"
     "http://www.nytimes.com/services/xml/rss/nyt/Psychology.xml")

    ("news.world" "WORLD"
     "http://www.nytimes.com/services/xml/rss/nyt/International.xml")
    ("news.world.africa" "Africa News"
     "http://www.nytimes.com/services/xml/rss/nyt/Africa.xml")
    ("news.world.americas" "Americas News"
     "http://www.nytimes.com/services/xml/rss/nyt/Americas.xml")
    ("news.world.asia" "Asia News"
     "http://www.nytimes.com/services/xml/rss/nyt/AsiaPacific.xml")
    ("news.world.europe" "Europe News"
     "http://www.nytimes.com/services/xml/rss/nyt/Europe.xml")
    ("news.world.middleeast" "Middle East News"
     "http://www.nytimes.com/services/xml/rss/nyt/MiddleEast.xml")

    ("news.us" "U.S."
     "http://www.nytimes.com/services/xml/rss/nyt/National.xml")

    ("news.newyork" "NEW YORK / REGION"
     "http://www.nytimes.com/services/xml/rss/nyt/NYRegion.xml")
    ("news.newyork.thecity" "The City"
     "http://www.nytimes.com/services/xml/rss/nyt/TheCity.xml")
    ("news.newyork.metro" "Metro Campaigns"
     "http://www.nytimes.com/services/xml/rss/nyt/MetroCampaigns.xml")

    ("news.obituaries" "OBITUARIES"
     "http://www.nytimes.com/services/xml/rss/nyt/Obituaries.xml")

    ("news.science" "SCIENCE"
     "http://www.nytimes.com/services/xml/rss/nyt/Science.xml")
    ("news.science.earth" "Earth"
     "http://www.nytimes.com/services/xml/rss/nyt/Environment.xml")
    ("news.science.nutrition" "Nutrition"
     "http://www.nytimes.com/services/xml/rss/nyt/Nutrition.xml")
    ("news.science.space" "Space"
     "http://www.nytimes.com/services/xml/rss/nyt/Space.xml")

    ("news.sports" "SPORTS"
     "http://www.nytimes.com/services/xml/rss/nyt/Sports.xml")
    ("news.sports.basketball.college" "College Basketball"
     "http://www.nytimes.com/services/xml/rss/nyt/CollegeBasketball.xml")
    ("news.sports.football.college" "College Football"
     "http://www.nytimes.com/services/xml/rss/nyt/CollegeFootball.xml")
    ("news.sports.golf" "Golf"
     "http://www.nytimes.com/services/xml/rss/nyt/Golf.xml")
    ("news.sports.hockey" "Hockey"
     "http://www.nytimes.com/services/xml/rss/nyt/Hockey.xml")
    ("news.sports.other" "Other Sports"
     "http://www.nytimes.com/services/xml/rss/nyt/OtherSports.xml")
    ("news.sports.baseball.pro" "Pro Baseball"
     "http://www.nytimes.com/services/xml/rss/nyt/Baseball.xml")
    ("news.sports.basketball.pro" "Pro Basketball"
     "http://www.nytimes.com/services/xml/rss/nyt/ProBasketball.xml")
    ("news.sports.football.pro" "Pro Football"
     "http://www.nytimes.com/services/xml/rss/nyt/ProFootball.xml")
    ("news.sports.soccer" "Soccer"
     "http://www.nytimes.com/services/xml/rss/nyt/Soccer.xml")

    ("news.technology" "TECHNOLOGY"
     "http://www.nytimes.com/services/xml/rss/nyt/Technology.xml")
    ("news.technology.bits" "Bits"
     "http://bits.blogs.nytimes.com/rss2.xml")
    ("news.technology.circuits" "Circuits"
     "http://www.nytimes.com/services/xml/rss/nyt/Circuits.xml")
    ("news.technology.pogue" "Pogue's Posts"
     "http://pogue.blogs.nytimes.com/?feed=rss2")

    ("news.washington" "WASHINGTON"
     "http://www.nytimes.com/services/xml/rss/nyt/Washington.xml")

    ("features.arts" "ARTS"
     "http://www.nytimes.com/services/xml/rss/nyt/Arts.xml")
    ("features.arts.design" "Design"
     "http://www.nytimes.com/services/xml/rss/nyt/ArtandDesign.xml")
    ("features.arts.music" "Music"
     "http://www.nytimes.com/services/xml/rss/nyt/Music.xml")
    ("features.arts.television" "Television News"
     "http://www.nytimes.com/services/xml/rss/nyt/Television.xml")

    ("features.automobiles" "AUTOMOBILES"
     "http://www.nytimes.com/services/xml/rss/nyt/Automobiles.xml")

    ("features.books" "BOOKS"
     "http://www.nytimes.com/services/xml/rss/nyt/Books.xml")
    ("features.books.review" "Book Review"
     "http://www.nytimes.com/services/xml/rss/nyt/SundayBookReview.xml")

    ("features.dining&wine" "DINING & WINE"
     "http://www.nytimes.com/services/xml/rss/nyt/DiningandWine.xml")

    ("features.fashion" "FASHION & STYLE"
     "http://www.nytimes.com/services/xml/rss/nyt/FashionandStyle.xml")
    ("features.fashion.thursdaystyles" "Thursday Styles"
     "http://www.nytimes.com/services/xml/rss/nyt/ThursdayStyles.xml")
    ("features.fashion.weddings" "Weddings"
     "http://www.nytimes.com/services/xml/rss/nyt/Weddings.xml")

    ("features.home&garden" "HOME & GARDEN"
     "http://www.nytimes.com/services/xml/rss/nyt/HomeandGarden.xml")

    ("features.jobs" "JOBS"
     "http://www.nytimes.com/services/xml/rss/nyt/JobMarket.xml")

    ("features.magazine" "MAGAZINE"
     "http://www.nytimes.com/services/xml/rss/nyt/Magazine.xml")

    ("features.movie.news" "MOVIE NEWS"
     "http://www.nytimes.com/services/xml/rss/nyt/MovieNews.xml")

    ("features.movie.reviews" "MOVIE REVIEWS"
     "http://www.nytimes.com/services/xml/rss/nyt/Movies.xml")

    ("features.realestate" "REAL ESTATE"
     "http://www.nytimes.com/services/xml/rss/nyt/RealEstate.xml")

    ("features.theater" "THEATER"
     "http://www.nytimes.com/services/xml/rss/nyt/Theater.xml")

    ("features.travel" "TRAVEL"
     "http://www.nytimes.com/services/xml/rss/nyt/Travel.xml")
    ("features.travel.escapes" "Escapes"
     "http://www.nytimes.com/services/xml/rss/nyt/Escapes.xml")

    ("features.week_in_review" "WEEK IN REVIEW"
     "http://www.nytimes.com/services/xml/rss/nyt/WeekinReview.xml")

    ("additional.pop_top" "MOST E-MAILED ARTICLES"
     "http://www.nytimes.com/services/xml/rss/nyt/pop_top.xml")

    ;;("additional.multimedia" "MULTIMEDIA"
    ;; "http://www.nytimes.com/services/xml/rss/nyt/Multimedia.xml")

    ("opinion.editorial" "EDITORIALS / OP-ED"
     "http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml"))
  "Alist describing the feeds offered by this backend.
Each element has the form (GROUP TITLE URL):

GROUP is the group name; `shimbun-groups' returns the list of them.
TITLE is the section title that `shimbun-get-headers' appends, in
angle brackets, to the From header of every article of the group.
URL   is the RSS feed of the group, returned by `shimbun-index-url'.

Entries are looked up with `assoc', so GROUP must be unique.")
200
(defvar shimbun-nytimes-x-face-alist
  '(("default" . "\
Face: iVBORw0KGgoAAAANSUhEUgAAAHYAAAAQAgMAAAC+ZGPFAAAADFBMVEVLS0u8vLz///8ICAg
 XQ6oSAAABe0lEQVQY02OYkJm5atWqZavAwA1Er1i1yjETwl/AUP/5CZDuX/0LSK60qwGS81et+v8
 /CirNah8DpCer3wJx98YDifWrVor8KYJKp06dA6SX38paApLOB0uvCgvrgkq3XJsza8Wqpb+ylDV
 TgNIrtWbmL8xyT5u1Kitr6coABo9rcwwna036lHL8+v1M2/gJX43f96x8HmZYeOSWz+QPDCfuzNl
 b8qqoNtbyevKKv/F9ZaXro1Y89+vrNT153SmB4cS1OX2lWdN6YiOvJ6/0ze8rK1v/a8XztL65ZSd
 vNh5g+KEW01e2atn62JXXk1f1gqWNVlxJBUmfmmvAYLsUJL1wOZL03pXfloCl0wwY9gvP96vNqte
 Ojzx+ESjd72Y4Pz7Lxre31/Cr4f4DDJP/Tuibu4o5Mz3LS2pVqO/yrKYFS1f1f5s7t4yrzTmBYWX
 UqqZVq6TAobBSMxQS1kuzwNSsBQxAkgvIgEj//78KBYClEcBGGK/0qqVo0gCtEBjnqbJU8gAAAAB
 JRU5ErkJggg=="))
  "Alist of face header data keyed by group name.
Only the \"default\" key is defined here.  Its value is a complete,
folded `Face:' header line carrying base64 data (the leading
`iVBORw0KGgo' is the base64 form of the PNG signature).
Not referenced in this file; presumably looked up by the shimbun
framework when it builds article headers -- TODO confirm.")

(defvar shimbun-nytimes-expiration-days 7
  "Expiration setting for this backend, in days (7).
Not referenced in this file; presumably the age after which the
shimbun framework treats an article as expired -- TODO confirm.")
214
(luna-define-method shimbun-groups ((shimbun shimbun-nytimes))
  ;; The group names are the first elements of the entries of
  ;; `shimbun-nytimes-group-table', listed in table order.
  (mapcar (lambda (entry) (car entry))
	  shimbun-nytimes-group-table))
217
(luna-define-method shimbun-index-url ((shimbun shimbun-nytimes))
  ;; Look the current group up in `shimbun-nytimes-group-table' and
  ;; return the feed URL, the third element of its entry.  A group that
  ;; is not in the table yields nil.
  (let* ((group (shimbun-current-group-internal shimbun))
	 (entry (assoc group shimbun-nytimes-group-table)))
    (nth 2 entry)))
221
(defvar shimbun-nytimes-retry-fetching 1
  "Retry setting for this backend (1).
Not referenced in this file; presumably the number of times the
shimbun framework retries a failed fetch -- TODO confirm.")

(defvar shimbun-nytimes-japanese-hankaku 'never
  "Japanese hankaku (half-width) conversion setting; `never' here.
Not referenced in this file; presumably read by the shimbun framework
-- TODO confirm the accepted values there.")
225
(luna-define-method shimbun-multi-next-url ((shimbun shimbun-nytimes)
					    header url)
  ;; Return the absolute URL of the following page of a multi-page
  ;; article, or nil when the buffer holds no usable "next page" link.
  ;; The link's href is expanded relative to URL.  HEADER is not used.
  (goto-char (point-min))
  (if (re-search-forward
       "<a[\t\n ]+\\([^>]+\\)>[\t\n ]*next[\t\n ]+page[^<]*</a>"
       nil t)
      ;; Group 1 spans the attributes of the candidate anchor; both
      ;; searches below are confined to that span.
      (let ((attrs-beg (match-beginning 1))
	    (attrs-end (match-end 1)))
	(goto-char attrs-beg)
	;; Accept the anchor only if it is explicitly marked as the
	;; "next" link by its class or title attribute.
	(if (re-search-forward
	     "class=\"next\"\\|title=\"next[\t\n ]+page\""
	     attrs-end t)
	    (progn
	      ;; The href may precede the marker attribute, so rescan
	      ;; the attributes from their beginning.
	      (goto-char attrs-beg)
	      (if (re-search-forward "href=\"\\([^\"]+\\)\"" attrs-end t)
		  (shimbun-expand-url (match-string 1) url)))))))
242
(luna-define-method shimbun-clear-contents :around ((shimbun shimbun-nytimes)
						    header)
  ;; Let `shimbun-nytimes-clear-contents' trim the buffer down to the
  ;; article.  When it reports failure (nil), replace the whole buffer
  ;; with a short apology page and return nil; otherwise return the
  ;; non-nil value it gave back.
  (let ((cleared (shimbun-nytimes-clear-contents shimbun header)))
    (unless cleared
      (erase-buffer)
      (insert "<html><body><i>This article may have been expired,\
 use the format different from the ordinary style that NYTimes uses,\
 or have not been successful to fetch.  Sorry.</i></body></html>\n"))
    cleared))
252
(defun shimbun-nytimes-clear-contents (shimbun header)
  "Trim the current buffer down to the article body and tidy its HTML.
First locate the article (or, failing that, a blog comment listing)
and delete everything around it; then run a series of cleanup passes
over what is left.  All searches are case-insensitive.

SHIMBUN and HEADER are accepted but not used by this function.

Return t if an article body or a comment listing was found, nil
otherwise; in the nil case the cleanup passes are not run."
  (shimbun-strip-cr)
  ;; START matches the opening of the article: a <p ... class="post-author">
  ;; tag, a <div ... class="entry-content"> / "post-content" tag (only this
  ;; alternative sets group 1, which covers the tag up to the class
  ;; attribute), or a <NYT_BYLINE> / <NYT_TEXT> tag; trailing whitespace
  ;; is included in the match.
  (let ((start "\
\\(?:\
\\(?:<p[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*class=\"post-author\"\
\\|\\(<div[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*class=\
\"\\(?:entry\\|post\\)-content\"\\)\\)\
\\(?:[\t\n ]+[^\t\n >]+\\)*[\t\n ]*>\
\\|\
<NYT_\\(?:BYLINE\\|TEXT\\)\\(?:[\t\n ]*\\|[\t\n ]+[^>]+\\)>\
\\)[\t\n ]*")
	;; END matches the end of the article: optional whitespace, then
	;; (group 1) any run of tags (group 2 is the last of them) followed
	;; by an "<!-- end post-content -->" comment, an NYT_UPDATE_BOTTOM
	;; tag (opening or closing) or a closing </NYT_TEXT> tag.
	(end "[\t\n ]*\\(\\(<[^>]+>[\t\n ]*\\)*\
\\(?:\
<!-+[\t\n ]*end[\t\n ]+post-content[\t\n ]*-+>\
\\|\
<\\(?:/?NYT_UPDATE_BOTTOM\\|/NYT_TEXT\\)\\(?:[\t\n ]+[^>]+\\)?>\
\\)\\)")
	(case-fold-search t)
	;; PCONT: marker at the opening <div> of the content, if START
	;; matched that alternative.  NAME: anchor name used by the
	;; `Skip to next paragraph' pass.  START and END are reused as
	;; buffer positions by the cleanup passes further down.
	pcont name)
    (goto-char (point-min))
    (when (or (and (re-search-forward start nil t)
		   (progn
		     (save-restriction
		       (setq pcont
			     ;; The marker version of (match-beginning 1).
			     (nth 2 (match-data)))
		       ;; Work only on the text preceding the article,
		       ;; including the START match itself.
		       (narrow-to-region (point-min) (match-end 0))
		       ;; If a <div class="image"> follows the headline,
		       ;; keep just that element; otherwise drop the
		       ;; whole leading part.  `shimbun-end-of-tag' is
		       ;; defined elsewhere; judging from its use here it
		       ;; returns non-nil and sets the match data around
		       ;; the element when its end is found.
		       (if (and (search-backward "</NYT_HEADLINE>" nil t)
				(re-search-forward "\
<div[\t\n ]+class=\"image\""
						   nil t)
				(progn
				  (setq start (match-beginning 0))
				  (shimbun-end-of-tag "div")))
			   (progn
			     (delete-region (match-end 0) (point-max))
			     (delete-region (point-min) start)
			     (goto-char (point-max)))
			 (delete-region (point-min) (point-max))))
		     ;; An empty byline leaves its closing tag right at
		     ;; point; remove it.
		     ;; NOTE(review): the deletion starts at `point-min',
		     ;; so an image <div> kept just above is removed as
		     ;; well in that case -- confirm this is intended.
		     (when (looking-at "</NYT_BYLINE>[\t\n ]*")
		       (delete-region (point-min) (match-end 0)))
		     ;; Cut off everything following the article.
		     (or (when (re-search-forward end nil t)
			   (delete-region
			    ;; If tags precede the end marker and they
			    ;; include a closing </blockquote>, </div>
			    ;; or </ul>, keep the tags up to and
			    ;; including it (group 1 of the inner
			    ;; search); otherwise cut where the END
			    ;; match begins.
			    (if (and (match-beginning 2)
				     (progn
				       (goto-char (match-beginning 1))
				       (re-search-forward "\
\\(?:<[^>]+>\\)*\\(</blockquote>\\|</div>\\|</ul>\\)[\t\n ]*"
							  (match-end 2) t)))
				(match-end 1)
			      (match-beginning 0))
			    (point-max))
			   t)
			 ;; No end marker.  If the article started with a
			 ;; content <div> (whose opening tag has been
			 ;; deleted above), put a bare <div> back at that
			 ;; spot and keep only what `shimbun-end-of-tag'
			 ;; reports as group 3 -- presumably the
			 ;; element's contents; verify against its
			 ;; definition.
			 (when (and pcont
				    (progn
				      (goto-char pcont)
				      (insert "<div>")
				      (goto-char pcont)
				      (shimbun-end-of-tag "div" t)))
			   (delete-region (match-end 3) (point-max))
			   (delete-region (point-min) (match-beginning 3))
			   t))))
	      (progn
		;; Extract blog listing.
		(goto-char (point-min))
		(when (and (re-search-forward "\
<div[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*id=\"blog_comments\""
					      nil t)
			   (shimbun-end-of-tag "div" t))
		  ;; Keep only group 3 of the <div id="blog_comments">
		  ;; match (presumably its contents).
		  (delete-region (match-end 3) (point-max))
		  (delete-region (point-min) (match-beginning 3))
		  ;; Remove <ul>.
		  (goto-char (point-min))
		  (when (re-search-forward "\
<ul[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*class=\"commentlist\""
					   nil t)
		    ;; With a complete <ul> element, replace the text
		    ;; after group 3 and the text before group 3 by a
		    ;; newline each (the closing one first, so that the
		    ;; earlier positions stay valid).  Otherwise, if
		    ;; `shimbun-end-of-tag' called with a nil tag name
		    ;; succeeds, replace its whole match by a newline.
		    (cond ((shimbun-end-of-tag "ul" t)
			   (delete-region (goto-char (match-end 3))
					  (match-end 0))
			   (insert "\n")
			   (delete-region (goto-char (match-beginning 0))
					  (match-beginning 3))
			   (insert "\n"))
			  ((shimbun-end-of-tag nil t)
			   (replace-match "\n"))))
		  ;; Remove useless links.
		  ;; Each <a href="#..."> element is replaced by group 2
		  ;; of the `shimbun-end-of-tag' match (presumably the
		  ;; link text) followed by <br>.
		  (goto-char (point-min))
		  (while (and (re-search-forward "\
<a[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*href=\"#"
						 nil t)
			      (shimbun-end-of-tag "a"))
		    (replace-match "\\2<br>"))
		  t)))
      ;; From here on the buffer holds only the extracted contents;
      ;; every pass below rescans it from the beginning.
      ;; Insert a new line after every image.
      (goto-char (point-min))
      (while (re-search-forward "\\(<img[\t\n ]+[^>]+>\\)[\t\n ]*" nil t)
	(replace-match "\\1<br>"))
      ;; Remove the `Skip to next paragraph' buttons.
      (goto-char (point-min))
      (while (re-search-forward "[\t\n ]*\
\\(?:<div[\t\n ]+[^>]+>[\t\n ]*\\)*\
<a[\t\n ]+href=\"#\\([^\"]+\\)\"[^>]*>[\t\n ]*\
Skip[\t\n ]+to[\t\n ]+next[\t\n ]+paragraph[\t\n ]*</a>[\t\n ]*"
				nil t)
	;; Remember the extent of the button and the name of the anchor
	;; it jumps to.
	(setq start (match-beginning 0)
	      end (match-end 0)
	      name (match-string 1))
	;; The button is removed only if its target anchor follows.
	(when (re-search-forward (concat "[\t\n ]*<a[\t\n ]+name=\""
					 (regexp-quote name)
					 "\"[^>]*>[\t\n ]*</a>[\t\n ]*")
				 nil t)
	  ;;(delete-region (match-beginning 0) (match-end 0))
	  ;; NYTimes is apt to forget to put this.
	  (replace-match "</ul>")
	  ;; The anchor lies after the button, so replacing it first
	  ;; leaves START and END valid.
	  (delete-region (goto-char start) end)
	  (insert "\n")))
      ;; Remove Next/Previous buttons.
      (shimbun-remove-tags
       "\\(div\\)[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*id=\"pageLinks\"" t)
      ;; Remove `Enlarge This Image', `Multimedia', and `Video'.
      (shimbun-remove-tags "\\(div\\)[\t\n ]+\
\\(?:class=\"enlargeThis\\|id=\"inlineMultimedia\
\\|class=\"inlineVideo\\(?:[\t\n ]+[^\"]+\\)?\\)\"" t)
      ;; Remove javascripts.
      ;; Only the <a href="javascript:..."> tag and the next </a> are
      ;; replaced (by a newline each); the text between them is kept.
      (goto-char (point-min))
      (while (and (re-search-forward "[\t\n ]*\
<a[\t\n ]+href=\"javascript:[^>]+>[\t\n ]*"
				     nil t)
		  (progn
		    (setq start (match-beginning 0)
			  end (match-end 0))
		    (re-search-forward "[\t\n ]*</a>[\t\n ]*" nil t)))
	(replace-match "\n")
	(delete-region (goto-char start) end)
	(insert "\n"))
      ;; Remove useless timesselect stuff.
      (goto-char (point-min))
      (while (re-search-forward "[\t\n ]*<img\\(?:[\t\n ]+[^\t\n >]+\\)*\
[\t\n ]+src=\"[^\"]*/ts_icon\\.gif\"\\(?:[\t\n ]+[^\t\n >]+\\)*[\t\n ]*>\
[\t\n ]*"
				nil t)
	(delete-region (match-beginning 0) (match-end 0)))
      ;; Replace wide apostrophe with the normal one.
      (goto-char (point-min))
      (while (re-search-forward "&#8217;\\|&#x2019;" nil t)
	(replace-match "&#39;"))
      ;; Add page delimiters.
      ;; A "(Page N of M)" <font> element surrounded by <p> tags gets a
      ;; form feed character reference (&#012;) put in front of it.
      (goto-char (point-min))
      (while (re-search-forward "[\t\n ]*\\(?:<p>[\t\n ]*\\)+\
\\(<font[\t\n ]+[^>]+>[\t\n ]*(Page[\t\n ]+[0-9]+[\t\n ]+of[\t\n ]+[0-9]+)\
[\t\n ]*</font>\\)\\(?:[\t\n ]*<p>\\)+[\t\n ]*"
				nil t)
	(replace-match "\n&#012;\\1\n<p>"))
      ;; Add last newline.
      (goto-char (point-max))
      (unless (bolp)
	(insert "\n"))
      t)))
411
(luna-define-method shimbun-rss-build-message-id :around ((shimbun
							   shimbun-nytimes)
							  url &optional date)
  ;; Build a Message-ID of the form <MD5%GROUP@SERVER.shimbun.namazu.org>.
  ;; The digest is taken over the complete URL: unlike the method in
  ;; sb-rss.el, nothing following "?" or "#" is stripped first.  DATE is
  ;; ignored and the next method is never called.
  (let* ((digest (md5 url))
	 (group (shimbun-current-group shimbun))
	 (server (shimbun-server shimbun)))
    (concat "<" digest "%" group "@" server ".shimbun.namazu.org>")))
418
(luna-define-method shimbun-get-headers :around ((shimbun shimbun-nytimes)
						 &optional range)
  ;; Post-process the headers produced by the next method and return
  ;; them: tag every From header with the section title of the current
  ;; group and normalize apostrophes in the subjects.
  (let ((group-title (nth 1 (assoc (shimbun-current-group-internal shimbun)
				   shimbun-nytimes-group-table)))
	;; The wide apostrophe character, or nil if it cannot be made.
	(wide-quote (condition-case nil
			(make-char 'japanese-jisx0208 33 71)
		      (error nil)))
	(headers (luna-call-next-method)))
    (dolist (header headers)
      (let ((from (shimbun-header-from header)))
	;; Drop a leading "By " from the author and show the group name
	;; in the From header.
	(if (and from
		 (string-match "\\`By [A-Z][A-Z]+" from))
	    (setq from (substring from 3)))
	(shimbun-header-set-from header
				 (concat from " <" group-title ">")))
      ;; Replace wide apostrophe with the normal one in the subject.
      (if wide-quote
	  (shimbun-header-set-subject
	   header
	   (subst-char-in-string wide-quote ?'
				 (shimbun-header-subject header t)))))
    headers))
439
440(provide 'sb-nytimes)
441
442;;; sb-nytimes.el ends here
443