;;; sb-nytimes.el --- shimbun backend for The New York Times

;; Copyright (C) 2007-2010, 2019 Katsumi Yamaoka

;; Author: Katsumi Yamaoka <yamaoka@jpl.org>
;; Keywords: news

;; This file is a part of shimbun.

;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to
;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;;; Commentary:

;; This backend maps shimbun group names to New York Times RSS feeds
;; (see `shimbun-nytimes-group-table'), follows "next page" links of
;; multi-page articles, and trims each fetched page down to the article
;; body (see `shimbun-nytimes-clear-contents').

;;; Code:

(require 'shimbun)
(require 'sb-rss)
(require 'sb-multi)

(luna-define-class shimbun-nytimes (shimbun-newspaper
                                    shimbun-multi shimbun-rss) ())

(defvar shimbun-nytimes-url "http://www.nytimes.com/"
  "Name of the parent url.")

;; Not referenced elsewhere in this file; presumably looked up by the
;; shimbun framework through its `shimbun-NAME-...' naming convention
;; -- TODO confirm against shimbun.el.
(defvar shimbun-nytimes-server-name "The New York Times")

(defvar shimbun-nytimes-group-table
  '(("homepage" "NYTIMES.COM HOMEPAGE"
     "http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml")

    ("news.business" "BUSINESS"
     "http://www.nytimes.com/services/xml/rss/nyt/Business.xml")
    ("news.business.media&advertising" "Media & Advertising"
     "http://www.nytimes.com/services/xml/rss/nyt/MediaandAdvertising.xml")
    ("news.business.worldbusiness" "World Business"
     "http://www.nytimes.com/services/xml/rss/nyt/WorldBusiness.xml")
    ("news.business.smallbusiness" "Small Business"
     "http://www.nytimes.com/services/xml/rss/nyt/SmallBusiness.xml")
    ("news.business.yourmoney" "Your Money"
     "http://www.nytimes.com/services/xml/rss/nyt/YourMoney.xml")
    ("news.business.dealbook" "DealBook"
     "http://dealbook.blogs.nytimes.com/rss2.xml")

    ("news.education" "EDUCATION"
     "http://www.nytimes.com/services/xml/rss/nyt/Education.xml")

    ("news.health" "HEALTH"
     "http://www.nytimes.com/services/xml/rss/nyt/Health.xml")
    ("news.health.policy" "Health Policy"
     "http://www.nytimes.com/services/xml/rss/nyt/HealthCarePolicy.xml")
    ("news.health.psychology" "Mental Health & Behavior"
     "http://www.nytimes.com/services/xml/rss/nyt/Psychology.xml")

    ("news.world" "WORLD"
     "http://www.nytimes.com/services/xml/rss/nyt/International.xml")
    ("news.world.africa" "Africa News"
     "http://www.nytimes.com/services/xml/rss/nyt/Africa.xml")
    ("news.world.americas" "Americas News"
     "http://www.nytimes.com/services/xml/rss/nyt/Americas.xml")
    ("news.world.asia" "Asia News"
     "http://www.nytimes.com/services/xml/rss/nyt/AsiaPacific.xml")
    ("news.world.europe" "Europe News"
     "http://www.nytimes.com/services/xml/rss/nyt/Europe.xml")
    ("news.world.middleeast" "Middle East News"
     "http://www.nytimes.com/services/xml/rss/nyt/MiddleEast.xml")

    ("news.us" "U.S."
     "http://www.nytimes.com/services/xml/rss/nyt/National.xml")

    ("news.newyork" "NEW YORK / REGION"
     "http://www.nytimes.com/services/xml/rss/nyt/NYRegion.xml")
    ("news.newyork.thecity" "The City"
     "http://www.nytimes.com/services/xml/rss/nyt/TheCity.xml")
    ("news.newyork.metro" "Metro Campaigns"
     "http://www.nytimes.com/services/xml/rss/nyt/MetroCampaigns.xml")

    ("news.obituaries" "OBITUARIES"
     "http://www.nytimes.com/services/xml/rss/nyt/Obituaries.xml")

    ("news.science" "SCIENCE"
     "http://www.nytimes.com/services/xml/rss/nyt/Science.xml")
    ("news.science.earth" "Earth"
     "http://www.nytimes.com/services/xml/rss/nyt/Environment.xml")
    ("news.science.nutrition" "Nutrition"
     "http://www.nytimes.com/services/xml/rss/nyt/Nutrition.xml")
    ("news.science.space" "Space"
     "http://www.nytimes.com/services/xml/rss/nyt/Space.xml")

    ("news.sports" "SPORTS"
     "http://www.nytimes.com/services/xml/rss/nyt/Sports.xml")
    ("news.sports.basketball.college" "College Basketball"
     "http://www.nytimes.com/services/xml/rss/nyt/CollegeBasketball.xml")
    ("news.sports.football.college" "College Football"
     "http://www.nytimes.com/services/xml/rss/nyt/CollegeFootball.xml")
    ("news.sports.golf" "Golf"
     "http://www.nytimes.com/services/xml/rss/nyt/Golf.xml")
    ("news.sports.hockey" "Hockey"
     "http://www.nytimes.com/services/xml/rss/nyt/Hockey.xml")
    ("news.sports.other" "Other Sports"
     "http://www.nytimes.com/services/xml/rss/nyt/OtherSports.xml")
    ("news.sports.baseball.pro" "Pro Baseball"
     "http://www.nytimes.com/services/xml/rss/nyt/Baseball.xml")
    ("news.sports.basketball.pro" "Pro Basketball"
     "http://www.nytimes.com/services/xml/rss/nyt/ProBasketball.xml")
    ("news.sports.football.pro" "Pro Football"
     "http://www.nytimes.com/services/xml/rss/nyt/ProFootball.xml")
    ("news.sports.soccer" "Soccer"
     "http://www.nytimes.com/services/xml/rss/nyt/Soccer.xml")

    ("news.technology" "TECHNOLOGY"
     "http://www.nytimes.com/services/xml/rss/nyt/Technology.xml")
    ("news.technology.bits" "Bits"
     "http://bits.blogs.nytimes.com/rss2.xml")
    ("news.technology.circuits" "Circuits"
     "http://www.nytimes.com/services/xml/rss/nyt/Circuits.xml")
    ("news.technology.pogue" "Pogue's Posts"
     "http://pogue.blogs.nytimes.com/?feed=rss2")

    ("news.washington" "WASHINGTON"
     "http://www.nytimes.com/services/xml/rss/nyt/Washington.xml")

    ("features.arts" "ARTS"
     "http://www.nytimes.com/services/xml/rss/nyt/Arts.xml")
    ("features.arts.design" "Design"
     "http://www.nytimes.com/services/xml/rss/nyt/ArtandDesign.xml")
    ("features.arts.music" "Music"
     "http://www.nytimes.com/services/xml/rss/nyt/Music.xml")
    ("features.arts.television" "Television News"
     "http://www.nytimes.com/services/xml/rss/nyt/Television.xml")

    ("features.automobiles" "AUTOMOBILES"
     "http://www.nytimes.com/services/xml/rss/nyt/Automobiles.xml")

    ("features.books" "BOOKS"
     "http://www.nytimes.com/services/xml/rss/nyt/Books.xml")
    ("features.books.review" "Book Review"
     "http://www.nytimes.com/services/xml/rss/nyt/SundayBookReview.xml")

    ("features.dining&wine" "DINING & WINE"
     "http://www.nytimes.com/services/xml/rss/nyt/DiningandWine.xml")

    ("features.fashion" "FASHION & STYLE"
     "http://www.nytimes.com/services/xml/rss/nyt/FashionandStyle.xml")
    ("features.fashion.thursdaystyles" "Thursday Styles"
     "http://www.nytimes.com/services/xml/rss/nyt/ThursdayStyles.xml")
    ("features.fashion.weddings" "Weddings"
     "http://www.nytimes.com/services/xml/rss/nyt/Weddings.xml")

    ("features.home&garden" "HOME & GARDEN"
     "http://www.nytimes.com/services/xml/rss/nyt/HomeandGarden.xml")

    ("features.jobs" "JOBS"
     "http://www.nytimes.com/services/xml/rss/nyt/JobMarket.xml")

    ("features.magazine" "MAGAZINE"
     "http://www.nytimes.com/services/xml/rss/nyt/Magazine.xml")

    ("features.movie.news" "MOVIE NEWS"
     "http://www.nytimes.com/services/xml/rss/nyt/MovieNews.xml")

    ("features.movie.reviews" "MOVIE REVIEWS"
     "http://www.nytimes.com/services/xml/rss/nyt/Movies.xml")

    ("features.realestate" "REAL ESTATE"
     "http://www.nytimes.com/services/xml/rss/nyt/RealEstate.xml")

    ("features.theater" "THEATER"
     "http://www.nytimes.com/services/xml/rss/nyt/Theater.xml")

    ("features.travel" "TRAVEL"
     "http://www.nytimes.com/services/xml/rss/nyt/Travel.xml")
    ("features.travel.escapes" "Escapes"
     "http://www.nytimes.com/services/xml/rss/nyt/Escapes.xml")

    ("features.week_in_review" "WEEK IN REVIEW"
     "http://www.nytimes.com/services/xml/rss/nyt/WeekinReview.xml")

    ("additional.pop_top" "MOST E-MAILED ARTICLES"
     "http://www.nytimes.com/services/xml/rss/nyt/pop_top.xml")

    ;;("additional.multimedia" "MULTIMEDIA"
    ;; "http://www.nytimes.com/services/xml/rss/nyt/Multimedia.xml")

    ("opinion.editorial" "EDITORIALS / OP-ED"
     "http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml"))
  "Alist describing the groups served by this backend.
Each element has the form (GROUP DISPLAY-NAME RSS-URL).  GROUP is the
shimbun group name, DISPLAY-NAME is appended to the From header of
every article in that group, and RSS-URL is the feed used as the
index of the group.")

;; Alist of (GROUP . FACE-HEADER); only a "default" entry is defined.
;; Not referenced elsewhere in this file; presumably consumed by the
;; shimbun framework -- TODO confirm.
(defvar shimbun-nytimes-x-face-alist
  '(("default" . "\
Face: iVBORw0KGgoAAAANSUhEUgAAAHYAAAAQAgMAAAC+ZGPFAAAADFBMVEVLS0u8vLz///8ICAg
 XQ6oSAAABe0lEQVQY02OYkJm5atWqZavAwA1Er1i1yjETwl/AUP/5CZDuX/0LSK60qwGS81et+v8
 /CirNah8DpCer3wJx98YDifWrVor8KYJKp06dA6SX38paApLOB0uvCgvrgkq3XJsza8Wqpb+ylDV
 TgNIrtWbmL8xyT5u1Kitr6coABo9rcwwna036lHL8+v1M2/gJX43f96x8HmZYeOSWz+QPDCfuzNl
 b8qqoNtbyevKKv/F9ZaXro1Y89+vrNT153SmB4cS1OX2lWdN6YiOvJ6/0ze8rK1v/a8XztL65ZSd
 vNh5g+KEW01e2atn62JXXk1f1gqWNVlxJBUmfmmvAYLsUJL1wOZL03pXfloCl0wwY9gvP96vNqte
 Ojzx+ESjd72Y4Pz7Lxre31/Cr4f4DDJP/Tuibu4o5Mz3LS2pVqO/yrKYFS1f1f5s7t4yrzTmBYWX
 UqqZVq6TAobBSMxQS1kuzwNSsBQxAkgvIgEj//78KBYClEcBGGK/0qqVo0gCtEBjnqbJU8gAAAAB
 JRU5ErkJggg==")))

;; The three variables below (`...-expiration-days',
;; `...-retry-fetching' and `...-japanese-hankaku') are not referenced in
;; this file; presumably the shimbun framework reads them by naming
;; convention -- TODO confirm their exact meaning in shimbun.el.
(defvar shimbun-nytimes-expiration-days 7)

;; Return the list of group names, i.e. the cars of
;; `shimbun-nytimes-group-table'.
(luna-define-method shimbun-groups ((shimbun shimbun-nytimes))
  (mapcar #'car shimbun-nytimes-group-table))

;; Return the RSS url (third element of the table entry) registered for
;; the current group, or nil when the group is not in the table.
(luna-define-method shimbun-index-url ((shimbun shimbun-nytimes))
  (nth 2 (assoc (shimbun-current-group-internal shimbun)
                shimbun-nytimes-group-table)))

(defvar shimbun-nytimes-retry-fetching 1)

(defvar shimbun-nytimes-japanese-hankaku 'never)

;; Look in the current buffer for an anchor whose text begins with
;; "next page" and whose attributes contain class="next" or
;; title="next page".  When such an anchor also carries an href, return
;; that href expanded relative to URL; otherwise return nil.
(luna-define-method shimbun-multi-next-url ((shimbun shimbun-nytimes)
                                            header url)
  (goto-char (point-min))
  (when (re-search-forward
         "<a[\t\n ]+\\([^>]+\\)>[\t\n ]*next[\t\n ]+page[^<]*</a>"
         nil t)
    ;; START..END delimit the attribute text of the matched anchor; both
    ;; searches below are bounded by END so that they cannot leave the tag.
    (let ((start (match-beginning 1))
          (end (match-end 1)))
      (goto-char start)
      (when (and (re-search-forward
                  "class=\"next\"\\|title=\"next[\t\n ]+page\""
                  end t)
                 (progn
                   ;; Rewind: href may precede the class/title attribute.
                   (goto-char start)
                   (re-search-forward "href=\"\\([^\"]+\\)\"" end t)))
        (shimbun-expand-url (match-string 1) url)))))

;; Delegate to `shimbun-nytimes-clear-contents'.  When that returns nil,
;; replace the whole buffer with a short apology page and return nil.
(luna-define-method shimbun-clear-contents :around ((shimbun shimbun-nytimes)
                                                    header)
  (or (shimbun-nytimes-clear-contents shimbun header)
      (progn
        (erase-buffer)
        (insert "<html><body><i>This article may have been expired,\
 use the format different from the ordinary style that NYTimes uses,\
 or have not been successful to fetch. Sorry.</i></body></html>\n")
        nil)))

(defun shimbun-nytimes-clear-contents (shimbun header)
  "Reduce the current buffer to the article body and tidy up its HTML.
SHIMBUN and HEADER are accepted but are not used by this function.

Two layouts are tried in order: an article whose body starts at a
post-author paragraph, an entry/post-content div, or an NYT_BYLINE or
NYT_TEXT tag; and, failing that, a blog page containing a
\"blog_comments\" div.  When one of them is found, the surrounding
markup is deleted, several kinds of decoration are removed, and t is
returned.  Otherwise nil is returned; note that the buffer may already
have been partially modified in that case."
  (shimbun-strip-cr)
  (let ((start-regexp "\
\\(?:\
\\(?:<p[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*class=\"post-author\"\
\\|\\(<div[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*class=\
\"\\(?:entry\\|post\\)-content\"\\)\\)\
\\(?:[\t\n ]+[^\t\n >]+\\)*[\t\n ]*>\
\\|\
<NYT_\\(?:BYLINE\\|TEXT\\)\\(?:[\t\n ]*\\|[\t\n ]+[^>]+\\)>\
\\)[\t\n ]*")
        (end-regexp "[\t\n ]*\\(\\(<[^>]+>[\t\n ]*\\)*\
\\(?:\
<!-+[\t\n ]*end[\t\n ]+post-content[\t\n ]*-+>\
\\|\
<\\(?:/?NYT_UPDATE_BOTTOM\\|/NYT_TEXT\\)\\(?:[\t\n ]+[^>]+\\)?>\
\\)\\)")
        (case-fold-search t)
        ;; START and END hold buffer positions only; the regexps above
        ;; are kept in separate variables so that the two roles are not
        ;; mixed up.
        start end pcont name)
    (goto-char (point-min))
    (when (or (and (re-search-forward start-regexp nil t)
                   (progn
                     (save-restriction
                       (setq pcont
                             ;; The marker version of (match-beginning 1).
                             (nth 2 (match-data)))
                       ;; Work only on the text preceding the body: keep
                       ;; an image div that follows the headline if there
                       ;; is one, and delete everything else.
                       (narrow-to-region (point-min) (match-end 0))
                       (if (and (search-backward "</NYT_HEADLINE>" nil t)
                                (re-search-forward "\
<div[\t\n ]+class=\"image\""
                                                   nil t)
                                (progn
                                  (setq start (match-beginning 0))
                                  (shimbun-end-of-tag "div")))
                           (progn
                             (delete-region (match-end 0) (point-max))
                             (delete-region (point-min) start)
                             (goto-char (point-max)))
                         (delete-region (point-min) (point-max))))
                     (when (looking-at "</NYT_BYLINE>[\t\n ]*")
                       (delete-region (point-min) (match-end 0)))
                     (or (when (re-search-forward end-regexp nil t)
                           ;; Cut from the end marker to the end of the
                           ;; buffer.  If the tags just before the marker
                           ;; include a closing blockquote, div or ul,
                           ;; keep the text up to and including it.
                           (delete-region
                            (if (and (match-beginning 2)
                                     (progn
                                       (goto-char (match-beginning 1))
                                       (re-search-forward "\
\\(?:<[^>]+>\\)*\\(</blockquote>\\|</div>\\|</ul>\\)[\t\n ]*"
                                                          (match-end 2) t)))
                                (match-end 1)
                              (match-beginning 0))
                            (point-max))
                           t)
                         ;; No end marker: fall back to the extent of the
                         ;; content div.  Its opening tag was deleted
                         ;; above, so a bare <div> is inserted at PCONT
                         ;; before `shimbun-end-of-tag' is called.
                         (when (and pcont
                                    (progn
                                      (goto-char pcont)
                                      (insert "<div>")
                                      (goto-char pcont)
                                      (shimbun-end-of-tag "div" t)))
                           (delete-region (match-end 3) (point-max))
                           (delete-region (point-min) (match-beginning 3))
                           t))))
              (progn
                ;; Extract blog listing.
                (goto-char (point-min))
                (when (and (re-search-forward "\
<div[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*id=\"blog_comments\""
                                              nil t)
                           (shimbun-end-of-tag "div" t))
                  (delete-region (match-end 3) (point-max))
                  (delete-region (point-min) (match-beginning 3))
                  ;; Remove <ul>.
                  (goto-char (point-min))
                  (when (re-search-forward "\
<ul[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*class=\"commentlist\""
                                           nil t)
                    (cond ((shimbun-end-of-tag "ul" t)
                           (delete-region (goto-char (match-end 3))
                                          (match-end 0))
                           (insert "\n")
                           (delete-region (goto-char (match-beginning 0))
                                          (match-beginning 3))
                           (insert "\n"))
                          ((shimbun-end-of-tag nil t)
                           (replace-match "\n"))))
                  ;; Remove useless links.
                  (goto-char (point-min))
                  (while (and (re-search-forward "\
<a[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*href=\"#"
                                                 nil t)
                              (shimbun-end-of-tag "a"))
                    (replace-match "\\2<br>"))
                  t)))
      ;; Insert a new line after every image.
      (goto-char (point-min))
      (while (re-search-forward "\\(<img[\t\n ]+[^>]+>\\)[\t\n ]*" nil t)
        (replace-match "\\1<br>"))
      ;; Remove the `Skip to next paragraph' buttons.
      (goto-char (point-min))
      (while (re-search-forward "[\t\n ]*\
\\(?:<div[\t\n ]+[^>]+>[\t\n ]*\\)*\
<a[\t\n ]+href=\"#\\([^\"]+\\)\"[^>]*>[\t\n ]*\
Skip[\t\n ]+to[\t\n ]+next[\t\n ]+paragraph[\t\n ]*</a>[\t\n ]*"
                                nil t)
        (setq start (match-beginning 0)
              end (match-end 0)
              name (match-string 1))
        ;; The button is removed only when the anchor it points to is
        ;; found further down.
        (when (re-search-forward (concat "[\t\n ]*<a[\t\n ]+name=\""
                                         (regexp-quote name)
                                         "\"[^>]*>[\t\n ]*</a>[\t\n ]*")
                                 nil t)
          ;;(delete-region (match-beginning 0) (match-end 0))
          ;; NYTimes is apt to forget to put this.
          (replace-match "</ul>")
          (delete-region (goto-char start) end)
          (insert "\n")))
      ;; Remove Next/Previous buttons.
      (shimbun-remove-tags
       "\\(div\\)[\t\n ]+\\(?:[^\t\n >]+[\t\n ]+\\)*id=\"pageLinks\"" t)
      ;; Remove `Enlarge This Image', `Multimedia', and `Video'.
      (shimbun-remove-tags "\\(div\\)[\t\n ]+\
\\(?:class=\"enlargeThis\\|id=\"inlineMultimedia\
\\|class=\"inlineVideo\\(?:[\t\n ]+[^\"]+\\)?\\)\"" t)
      ;; Remove javascripts.
      (goto-char (point-min))
      (while (and (re-search-forward "[\t\n ]*\
<a[\t\n ]+href=\"javascript:[^>]+>[\t\n ]*"
                                     nil t)
                  (progn
                    (setq start (match-beginning 0)
                          end (match-end 0))
                    (re-search-forward "[\t\n ]*</a>[\t\n ]*" nil t)))
        ;; Replace the closing tag first: it lies after the opening tag,
        ;; so START and END are still valid for the deletion below.
        (replace-match "\n")
        (delete-region (goto-char start) end)
        (insert "\n"))
      ;; Remove useless timesselect stuff.
      (goto-char (point-min))
      (while (re-search-forward "[\t\n ]*<img\\(?:[\t\n ]+[^\t\n >]+\\)*\
[\t\n ]+src=\"[^\"]*/ts_icon\\.gif\"\\(?:[\t\n ]+[^\t\n >]+\\)*[\t\n ]*>\
[\t\n ]*"
                                nil t)
        (delete-region (match-beginning 0) (match-end 0)))
      ;; Replace wide apostrophe with the normal one.
      (goto-char (point-min))
      (while (re-search-forward "’" nil t)
        (replace-match "'"))
      ;; Add page delimiters.
      (goto-char (point-min))
      (while (re-search-forward "[\t\n ]*\\(?:<p>[\t\n ]*\\)+\
\\(<font[\t\n ]+[^>]+>[\t\n ]*(Page[\t\n ]+[0-9]+[\t\n ]+of[\t\n ]+[0-9]+)\
[\t\n ]*</font>\\)\\(?:[\t\n ]*<p>\\)+[\t\n ]*"
                                nil t)
        (replace-match "\n\\1\n<p>"))
      ;; Add last newline.
      (goto-char (point-max))
      (unless (bolp)
        (insert "\n"))
      t)))

;; Build a message id of the form
;; <MD5-OF-URL%GROUP@SERVER.shimbun.namazu.org>.  DATE is ignored.
(luna-define-method shimbun-rss-build-message-id :around ((shimbun
                                                           shimbun-nytimes)
                                                          url &optional date)
  ;; Don't strip string following "?" or "#" in url.  See sb-rss.el.
  (concat "<" (md5 url) "%" (shimbun-current-group shimbun)
          "@" (shimbun-server shimbun) ".shimbun.namazu.org>"))

;; Post-process the headers returned by the next method: drop a leading
;; "By " from the From field when it matches the byline pattern, append
;; the display name of the current group as " <NAME>", and replace the
;; wide apostrophe in the subject.  The (modified) header list is
;; returned.
(luna-define-method shimbun-get-headers :around ((shimbun shimbun-nytimes)
                                                 &optional range)
  (let ((name (cadr (assoc (shimbun-current-group-internal shimbun)
                           shimbun-nytimes-group-table)))
        ;; nil when `make-char' signals an error for this charset, in
        ;; which case the subject is left untouched.
        (apostrophe (condition-case nil
                        (make-char 'japanese-jisx0208 33 71)
                      (error nil)))
        (headers (luna-call-next-method))
        from)
    (dolist (header headers headers)
      ;; Show the group name in the From header.
      (when (and (setq from (shimbun-header-from header))
                 (string-match "\\`By [A-Z][A-Z]+" from))
        (setq from (substring from 3)))
      (shimbun-header-set-from header (concat from " <" name ">"))
      ;; Replace wide apostrophe with the normal one in the subject.
      (when apostrophe
        (shimbun-header-set-subject
         header (subst-char-in-string
                 apostrophe ?' (shimbun-header-subject header t)))))))

(provide 'sb-nytimes)

;;; sb-nytimes.el ends here