1#  File src/library/utils/R/aspell.R
2#  Part of the R package, https://www.R-project.org
3#
4#  Copyright (C) 1995-2019 The R Core Team
5#
6#  This program is free software; you can redistribute it and/or modify
7#  it under the terms of the GNU General Public License as published by
8#  the Free Software Foundation; either version 2 of the License, or
9#  (at your option) any later version.
10#
11#  This program is distributed in the hope that it will be useful,
12#  but WITHOUT ANY WARRANTY; without even the implied warranty of
13#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14#  GNU General Public License for more details.
15#
16#  A copy of the GNU General Public License is available at
17#  https://www.R-project.org/Licenses/
18
19aspell <-
20function(files, filter, control = list(), encoding = "unknown",
21         program = NULL, dictionaries = character())
22{
23    ## Take the given files and feed them through spell checker in
24    ## Ispell pipe mode.
25
26    ## Think about options and more command line options eventually.
27
28    program <- aspell_find_program(program)
29    if(is.na(program))
30        stop("No suitable spell-checker program found")
31
32    ## Be nice.
33    if(inherits(files, "Rd"))
34        files <- list(files)
35
36    files_are_names <- is.character(files)
37
38    filter_args <- list()
39    if(missing(filter) || is.null(filter)) {
40        filter <- if(!files_are_names) {
41            function(ifile, encoding) {
42                if(inherits(ifile, "srcfile"))
43                    readLines(ifile$filename, encoding = encoding,
44                              warn = FALSE)
45                else if(inherits(ifile, "connection"))
46                    readLines(ifile, encoding = encoding, warn = FALSE)
47                else {
48                    ## What should this do with encodings?
49                    as.character(ifile)
50                }
51            }
52        }
53        else NULL
54    }
55    else if(is.character(filter)) {
56        ## Look up filter in aspell filter db.
57        filter_name <- filter[1L]
58        filter <- aspell_filter_db[[filter_name]]
59        ## Warn if the filter was not found in the db.
60        if(is.null(filter))
61            warning(gettextf("Filter '%s' is not available.",
62                             filter_name),
63                    domain = NA)
64    }
65    else if(is.list(filter)) {
66        ## Support
67        ##   list("Rd", drop = "\\references"
68        ## at least for now.
69        filter_name <- filter[[1L]][1L]
70        filter_args <- filter[-1L]
71        filter <- aspell_filter_db[[filter_name]]
72        ## Warn if the filter was not found in the db.
73        if(is.null(filter))
74            warning(gettextf("Filter '%s' is not available.",
75                             filter_name),
76                    domain = NA)
77    }
78    else if(!is.function(filter))
79        stop("Invalid 'filter' argument.")
80
81    encoding <- rep_len(encoding, length(files))
82
83    verbose <- getOption("verbose")
84
85    db <- data.frame(Original = character(), File = character(),
86                     Line = integer(), Column = integer(),
87                     stringsAsFactors = FALSE)
88    db$Suggestions <- list()
89
90    tfile <- tempfile("aspell")
91    on.exit(unlink(tfile))
92
93    if(length(dictionaries)) {
94        paths <- aspell_find_dictionaries(dictionaries)
95        ind <- paths == ""
96        if(any(ind)) {
97            warning(gettextf("The following dictionaries were not found:\n%s",
98                             paste(sprintf("  %s", dictionaries[ind]),
99                                   collapse = "\n")),
100                    domain = NA)
101            paths <- paths[!ind]
102        }
103        if(length(paths)) {
104            words <- unlist(lapply(paths, readRDS), use.names = FALSE)
105            personal <- tempfile("aspell_personal")
106            on.exit(unlink(personal), add = TRUE)
107            ## <FIXME>
108            ## How can we get the right language set (if needed)?
109            ## Maybe aspell() needs an additional 'language' arg?
110            aspell_write_personal_dictionary_file(words, personal,
111                                                  program = program)
112            ## </FIXME>
113            control <- c(control, "-p", shQuote(personal))
114        }
115    }
116
117    ## No special expansion of control argument for now.
118    control <- as.character(control)
119
120    fnames <- names(files)
121    files <- as.list(files)
122
123    for (i in seq_along(files)) {
124
125        file <- files[[i]]
126        if(files_are_names)
127            fname <- file
128        else {
129            ## Try srcfiles and srcrefs ...
130            fname <- if(inherits(file, "srcfile"))
131                file$filename
132            else
133                attr(attr(file, "srcref"), "srcfile")$filename
134            ## As a last resort, try the names of the files argument.
135            if(is.null(fname))
136                fname <- fnames[i]
137            ## If unknown ...
138            if(is.null(fname))
139                fname <- "<unknown>"
140        }
141
142        enc <- encoding[i]
143
144        if(verbose)
145            message(gettextf("Processing file %s", fname),
146                    domain = NA)
147
148        lines <- if(is.null(filter))
149            readLines(file, encoding = enc, warn = FALSE)
150        else {
151            ## Assume that filter takes an input file (and additional
152            ## arguments) and return a character vector.
153            do.call(filter, c(list(file, encoding = enc), filter_args))
154        }
155
156        ## Allow filters to pass additional control arguments, in case
157        ## these need to be inferred from the file contents.
158        control <- c(control, attr(lines, "control"))
159
160        ## Need to escape all lines with carets to ensure Aspell handles
161        ## them as data: the Aspell docs say
162        ##   It is recommended that programmatic interfaces prefix every
163        ##   data line with an uparrow to protect themselves against
164        ##   future changes in Aspell.
165        writeLines(paste0("^", lines), tfile)
166        ## Note that this re-encodes character strings with marked
167        ## encodings to the current encoding (which is definitely fine
168        ## if this is UTF-8 and Aspell was compiled with full UTF-8
169        ## support).  Alternatively, we could try using something along
170        ## the lines of
171        ##   writeLines(paste0("^", lines), tfile,
172        ##              useBytes = TRUE)
173        ## and pass the encoding info to Aspell in case we know it.
174
175        out <- tools:::.system_with_capture(program, c("-a", control),
176                                            stdin = tfile)
177
178	if(out$status != 0L)
179	    stop(gettextf("Running aspell failed with diagnostics:\n%s",
180			  paste(out$stderr, collapse = "\n")),
181                 domain = NA)
182
183	## Hopefully everything worked ok.
184	lines <- out$stdout[-1L]
185	pos <- cumsum(lines == "") + 1L
186
187	## Format is as follows.
188	## First line is a header.
189	## Blank lines separate the results for each line.
190	## Results for the word on each line are given as follows.
191	## * If the word was found in the main dictionary, or your personal
192	##   dictionary, then the line contains only a `*'.
193	## * If the word is not in the dictionary, but there are
194	##   suggestions, then the line contains an `&', a space, the
195	##   misspelled word, a space, the number of near misses, the number
196	##   of characters between the beginning of the line and the
197	##   beginning of the misspelled word, a colon, another space, and a
198	##   list of the suggestions separated by commas and spaces.
199	## * If the word does not appear in the dictionary, and there are no
200	##   suggestions, then the line contains a `#', a space, the
201	##   misspelled word, a space, and the character offset from the
202	##   beginning of the line.
203	## This can be summarized as follows:
204	##   OK: *
205	##   Suggestions: & original count offset: miss, miss, ...
206	##   None: # original offset
207
208	## Look at words not in dictionary with suggestions.
209
210	if(any(ind <- startsWith(lines, "&"))) {
211	    info <- strsplit(lines[ind], ": ", fixed = TRUE)
212	    one <- strsplit(sapply(info, `[`, 1L), " ",  fixed = TRUE)
213	    two <- strsplit(sapply(info, `[`, 2L), ", ", fixed = TRUE)
214	    db1 <- data.frame(Original =
215			      as.character(sapply(one, `[`, 2L)),
216			      File = fname,
217			      Line = pos[ind],
218			      Column =
219			      as.integer(sapply(one, `[`, 4L)),
220			      stringsAsFactors = FALSE)
221	    db1$Suggestions <- two
222	    db <- rbind(db, db1)
223	}
224	## Looks at words not in dictionary with no suggestions.
225	if(any(ind <- startsWith(lines, "#"))) {
226	    one <- strsplit(lines[ind], " ", fixed = TRUE)
227	    db1 <- data.frame(Original =
228			      as.character(sapply(one, `[`, 2L)),
229			      File = fname,
230			      Line = pos[ind],
231			      Column =
232			      as.integer(sapply(one, `[`, 3L)),
233			      stringsAsFactors = FALSE)
234	    db1$Suggestions <- vector("list", length(one))
235	    db <- rbind(db, db1)
236	}
237    }
238
239    class(db) <- c("aspell", "data.frame")
240    db
241}
242
243format.aspell <-
244function(x, sort = TRUE, verbose = FALSE, indent = 2L, ...)
245{
246    if(!nrow(x)) return(character())
247
248    if(sort)
249        x <- x[order(x$Original, x$File, x$Line, x$Column), ]
250
251    from <- split(sprintf("%s:%d:%d", x$File, x$Line, x$Column),
252                  x$Original)
253
254    if(verbose) {
255        unlist(Map(function(w, f, s) {
256            sprintf("Word: %s\nFrom: %s\n%s",
257                    w,
258                    paste0(c("", rep.int("      ", length(f) - 1L)),
259                           f, collapse = "\n"),
260                    paste(strwrap(paste("Suggestions:",
261                                        paste(s[[1L]], collapse = " ")),
262                                  exdent = 6L, indent = 0L),
263                          collapse = "\n"))
264        },
265                   names(from),
266                   from,
267                   split(x$Suggestions, x$Original)))
268    } else {
269        sep <- sprintf("\n%s", strrep(" ", indent))
270        paste(names(from),
271              vapply(from, paste, "", collapse = sep),
272              sep = sep)
273    }
274}
275
276print.aspell <-
277function(x, ...)
278{
279    if(nrow(x))
280        writeLines(paste(format(x, ...), collapse = "\n\n"))
281    invisible(x)
282}
283
284summary.aspell <-
285function(object, ...)
286{
287    words <- sort(unique(object$Original))
288    if(length(words)) {
289        writeLines("Possibly mis-spelled words:")
290        print(words)
291    }
292    invisible(words)
293}
294
295aspell_filter_db <- new.env(hash = FALSE) # small
296aspell_filter_db$Rd <- tools::RdTextFilter
297aspell_filter_db$Sweave <- tools::SweaveTeXFilter
298
299aspell_find_program <-
300function(program = NULL)
301{
302    check <- !is.null(program) || !is.null(names(program))
303    if(is.null(program))
304        program <- getOption("aspell_program")
305    if(is.null(program))
306        program <- c("aspell", "hunspell", "ispell")
307    program <- Filter(nzchar, Sys.which(program))[1L]
308    if(!is.na(program) && check) {
309        out <- c(system(sprintf("%s -v", program),
310                        intern = TRUE), "")[1L]
311        if(grepl("really Aspell", out))
312            names(program) <- "aspell"
313        else if(grepl("really Hunspell", out))
314            names(program) <- "hunspell"
315        else if(grepl("International Ispell", out))
316            names(program) <- "ispell"
317        else
318            names(program) <- NA_character_
319    }
320    program
321}
322
323aspell_dictionaries_R <- "en_stats"
324
325aspell_find_dictionaries <-
326function(dictionaries, dirnames = character())
327{
328    dictionaries <- as.character(dictionaries)
329    if(!(n <- length(dictionaries))) return(character())
330
331    ## Always search the R system dictionary directory first.
332    dirnames <- c(file.path(R.home("share"), "dictionaries"), dirnames)
333
334    ## For now, all dictionary files should be .rds files.
335    if(any(ind <- !endsWith(dictionaries, ".rds")))
336        dictionaries[ind] <- sprintf("%s.rds", dictionaries[ind])
337
338    out <- character(n)
339    ## Dictionaries with no path separators are looked for in the given
340    ## dictionary directories (by default, the R system dictionary
341    ## directory).
342    ind <- grepl(.Platform$file.sep, dictionaries, fixed = TRUE)
343    ## (Equivalently, could check where paths == basename(paths).)
344    if(length(pos <- which(ind))) {
345        pos <- pos[file_test("-f", dictionaries[pos])]
346        out[pos] <- normalizePath(dictionaries[pos], "/")
347    }
348    if(length(pos <- which(!ind))) {
349        out[pos] <- find_files_in_directories(dictionaries[pos],
350                                              dirnames)
351    }
352    out
353}
354
355### Utilities.
356
357aspell_inspect_context <-
358function(x)
359{
360    x <- split(x, x$File)
361    y <- Map(function(f, x) {
362        lines <- readLines(f, warn = FALSE)[x$Line]
363        cbind(f,
364              x$Line,
365              substring(lines, 1L, x$Column - 1L),
366              x$Original,
367              substring(lines, x$Column + nchar(x$Original)))
368    },
369             names(x), x)
370    y <- data.frame(do.call(rbind, y), stringsAsFactors = FALSE)
371    names(y) <- c("File", "Line", "Left", "Original", "Right")
372    class(y) <- c("aspell_inspect_context", "data.frame")
373    y
374}
375
376print.aspell_inspect_context <-
377function(x, ...)
378{
379    s <- split(x, x$File)
380    nms <- names(s)
381    for(i in seq_along(s)) {
382        e <- s[[i]]
383        writeLines(c(sprintf("File '%s':", nms[i]),
384                     sprintf("  Line %s: \"%s\", \"%s\", \"%s\"",
385                             format(e$Line),
386                             gsub("\"", "\\\"", e$Left ), e$Original,
387                             gsub("\"", "\\\"", e$Right)),
388                     ""))
389    }
390    invisible(x)
391}
392
393
394## For spell-checking the R manuals:
395
396## This can really only be done with Aspell as the other checkers have
397## no texinfo mode.
398
399aspell_control_R_manuals <-
400    list(aspell =
401         c("--master=en_US",
402           "--add-extra-dicts=en_GB",
403           "--mode=texinfo",
404           "--add-texinfo-ignore=acronym",
405           "--add-texinfo-ignore=deftypefun",
406           "--add-texinfo-ignore=deftypefunx",
407           "--add-texinfo-ignore=findex",
408           "--add-texinfo-ignore=enindex",
409           "--add-texinfo-ignore=include",
410           "--add-texinfo-ignore=ifclear",
411           "--add-texinfo-ignore=ifset",
412           "--add-texinfo-ignore=math",
413           "--add-texinfo-ignore=macro",
414           "--add-texinfo-ignore=multitable",
415           "--add-texinfo-ignore=node",
416           "--add-texinfo-ignore=pkg",
417           "--add-texinfo-ignore=printindex",
418           "--add-texinfo-ignore=set",
419           "--add-texinfo-ignore=vindex",
420           "--add-texinfo-ignore-env=menu",
421           "--add-texinfo-ignore=CRANpkg"
422           ),
423         hunspell =
424         c("-d en_US,en_GB"))
425
426aspell_R_manuals <-
427function(which = NULL, dir = NULL, program = NULL,
428         dictionaries = aspell_dictionaries_R)
429{
430    if(is.null(dir)) dir <- tools:::.R_top_srcdir_from_Rd()
431    ## Allow specifying 'R-exts' and alikes, or full paths.
432    files <- if(is.null(which)) {
433        Sys.glob(file.path(dir, "doc", "manual", "*.texi"))
434    } else {
435        ind <- which(which ==
436                     basename(tools::file_path_sans_ext(which)))
437        which[ind] <-
438            file.path(dir, "doc", "manual",
439                      sprintf("%s.texi", which[ind]))
440        which
441    }
442
443    program <- aspell_find_program(program)
444
445    aspell(files,
446           control = aspell_control_R_manuals[[names(program)]],
447           program = program,
448           dictionaries = dictionaries)
449}
450
451## For spell-checking the R Rd files:
452
453aspell_control_R_Rd_files <-
454    list(aspell =
455         c("--master=en_US",
456           "--add-extra-dicts=en_GB"),
457         hunspell =
458         c("-d en_US,en_GB"))
459
460aspell_R_Rd_files <-
461function(which = NULL, dir = NULL, drop = "\\references",
462         program = NULL, dictionaries = aspell_dictionaries_R)
463{
464    files <- character()
465
466    if(is.null(dir)) dir <- tools:::.R_top_srcdir_from_Rd()
467
468    if(is.null(which)) {
469        which <- tools:::.get_standard_package_names()$base
470        # CHANGES.Rd could be dropped from checks in the future;
471        # it will not be updated post 2.15.0
472        files <- c(file.path(dir, "doc", "NEWS.Rd"),
473                   file.path(dir, "src", "gnuwin32", "CHANGES.Rd"))
474        files <- files[file_test("-f", files)]
475    }
476
477    files <-
478        c(files,
479          unlist(lapply(file.path(dir, "src", "library", which, "man"),
480                        tools::list_files_with_type,
481                        "docs", OS_subdirs = c("unix", "windows")),
482                 use.names = FALSE))
483
484    program <- aspell_find_program(program)
485
486    aspell(files,
487           filter = list("Rd", drop = drop),
488           control = aspell_control_R_Rd_files[[names(program)]],
489           program = program,
490           dictionaries = dictionaries)
491}
492
493## For spell-checking Rd files in a package:
494
495aspell_package_Rd_files <-
496function(dir, drop = c("\\author", "\\references"),
497         control = list(), program = NULL, dictionaries = character())
498{
499    dir <- normalizePath(dir, "/")
500
501    subdir <- file.path(dir, "man")
502    files <- if(dir.exists(subdir))
503        tools::list_files_with_type(subdir,
504                                    "docs",
505                                    OS_subdirs = c("unix", "windows"))
506    else character()
507
508    meta <- tools:::.get_package_metadata(dir, installed = FALSE)
509    if(is.na(encoding <- meta["Encoding"]))
510        encoding <- "unknown"
511
512    defaults <- .aspell_package_defaults(dir, encoding)$Rd_files
513    if(!is.null(defaults)) {
514        ## Direct settings currently override (could add a list add =
515        ## TRUE mechanism eventually).
516        if(!is.null(d <- defaults$drop))
517            drop <- d
518        if(!is.null(d <- defaults$control))
519            control <- d
520        if(!is.null(d <- defaults$program))
521            program <- d
522        if(!is.null(d <- defaults$dictionaries)) {
523            dictionaries <-
524                aspell_find_dictionaries(d, file.path(dir, ".aspell"))
525        }
526        ## <FIXME>
527        ## Deprecated in favor of specifying R level dictionaries.
528        ## Maybe give a warning (in particular if both are given)?
529        if(!is.null(d <- defaults$personal))
530            control <- c(control,
531                         sprintf("-p %s",
532                                 shQuote(file.path(dir, ".aspell", d))))
533        ## </FIXME>
534    }
535
536    macros <- tools::loadPkgRdMacros(dir,
537                                     macros = file.path(R.home("share"),
538                                                        "Rd", "macros",
539                                                        "system.Rd"))
540
541    aspell(files,
542           filter = list("Rd", drop = drop, macros = macros),
543           control = control,
544           encoding = encoding,
545           program = program,
546           dictionaries = dictionaries)
547}
548
549## For spell-checking the R vignettes:
550
551## This should really be done with Aspell as the other checkers have far
552## less powerful TeX modes.
553
554aspell_control_R_vignettes <-
555    list(aspell =
556         c("--mode=tex",
557           "--master=en_US",
558           "--add-extra-dicts=en_GB",
559           "--add-tex-command='code p'",
560           "--add-tex-command='pkg p'",
561           "--add-tex-command='CRANpkg p'"
562           ),
563         hunspell =
564         c("-t", "-d en_US,en_GB"))
565
566aspell_R_vignettes <-
567function(program = NULL, dictionaries = aspell_dictionaries_R)
568{
569    files <- Sys.glob(file.path(tools:::.R_top_srcdir_from_Rd(),
570                                "src", "library", "*", "vignettes",
571                                "*.Rnw"))
572
573    program <- aspell_find_program(program)
574
575    aspell(files,
576           filter = "Sweave",
577           control = aspell_control_R_vignettes[[names(program)]],
578           program = program,
579           dictionaries = dictionaries)
580}
581
582## For spell-checking vignettes in a package:
583
584## This should really be done with Aspell as the other checkers have far
585## less powerful TeX modes.
586
587aspell_control_package_vignettes <-
588    list(aspell =
589         c("--add-tex-command='citep oop'",
590           "--add-tex-command='Sexpr p'",
591           "--add-tex-command='code p'",
592           "--add-tex-command='pkg p'",
593           "--add-tex-command='proglang p'",
594           "--add-tex-command='samp p'"
595           ))
596
597aspell_package_vignettes <-
598function(dir,
599         control = list(), program = NULL, dictionaries = character())
600{
601    dir <- tools::file_path_as_absolute(dir)
602
603    vinfo <- tools::pkgVignettes(dir = dir)
604    files <- vinfo$docs
605    if(!length(files)) return(aspell(character()))
606
607    ## We need the package encoding to read the defaults file ...
608    meta <- tools:::.get_package_metadata(dir, installed = FALSE)
609    if(is.na(encoding <- meta["Encoding"]))
610        encoding <- "unknown"
611
612    defaults <- .aspell_package_defaults(dir, encoding)$vignettes
613    if(!is.null(defaults)) {
614        if(!is.null(d <- defaults$control))
615            control <- d
616        if(!is.null(d <- defaults$program))
617            program <- d
618        if(!is.null(d <- defaults$dictionaries)) {
619            dictionaries <-
620                aspell_find_dictionaries(d, file.path(dir, ".aspell"))
621        }
622        ## <FIXME>
623        ## Deprecated in favor of specifying R level dictionaries.
624        ## Maybe give a warning (in particular if both are given)?
625        if(!is.null(d <- defaults$personal))
626            control <- c(control,
627                         sprintf("-p %s",
628                                 shQuote(file.path(dir, ".aspell", d))))
629        ## </FIXME>
630    }
631
632    program <- aspell_find_program(program)
633
634    fgroups <- split(files, vinfo$engines)
635    egroups <- split(vinfo$encodings, vinfo$engines)
636
637    do.call(rbind,
638            Map(function(fgroup, egroup, engine) {
639                engine <- tools::vignetteEngine(engine)
640                aspell(fgroup,
641                       filter = engine$aspell$filter,
642                       control =
643                       c(engine$aspell$control,
644                         aspell_control_package_vignettes[[names(program)]],
645                         control),
646                       encoding = egroup,
647                       program = program,
648                       dictionaries = dictionaries)
649            },
650                fgroups,
651                egroups,
652                names(fgroups)
653                )
654            )
655}
656
657## Spell-checking R files.
658
659aspell_filter_db$R <-
660function(ifile, encoding = "unknown", ignore = character())
661{
662    pd <- get_parse_data_for_message_strings(ifile, encoding)
663    if(is.null(pd) || !NROW(pd)) return(character())
664
665    ## Strip the string delimiters.
666    pd$text <- substring(pd$text, 2L, nchar(pd$text) - 1L)
667    ## Replace whitespace C backslash escape sequences by whitespace.
668    pd$text <- gsub("(^|[^\\])\\\\[fnrt]", "\\1  ", pd$text)
669    pd$text <- gsub(  "([^\\])\\\\[fnrt]", "\\1  ", pd$text)
670    ## (Do this twice for now because in e.g.
671    ##    \n\t\tInformation on package %s
672    ## the first \t is not matched the first time.  Alternatively, we
673    ## could match with
674    ##    (^|[^\\])((\\\\[fnrt])+)
675    ## but then computing the replacement (\\1 plus as many blanks as
676    ## the characters in \\2) is not straightforward.
677    ## For gettextf() calls, replace basic percent escape sequences by
678    ## whitespace.
679    ind <- pd$caller == "gettextf"
680    if(any(ind)) {
681        pd$text[ind] <-
682            gsub("(^|[^%])%[dioxXfeEgGaAs]", "\\1  ", pd$text[ind])
683        pd$text[ind] <-
684            gsub("  ([^%])%[dioxXfeEgGaAs]", "\\1  ", pd$text[ind])
685        ## (See above for doing this twice.)
686    }
687
688    lines <- readLines(ifile, encoding = encoding, warn = FALSE)
689
690    ## Column positions in the parse data have tabs expanded to tab
691    ## stops using a tab width of 8, so for lines with tabs we need to
692    ## map the column positions back to character positions.
693    lines_in_pd <- sort(unique(c(pd$line1, pd$line2)))
694    tab <- Map(function(tp, nc) {
695        if(tp[1L] == -1L) return(NULL)
696        widths <- rep.int(1, nc)
697        for(i in tp) {
698            cols <- cumsum(widths)
699            widths[i] <- 8 - (cols[i] - 1) %% 8
700        }
701        cumsum(widths)
702    },
703               gregexpr("\t", lines[lines_in_pd], fixed = TRUE),
704               nchar(lines[lines_in_pd]))
705    names(tab) <- lines_in_pd
706
707    lines[lines_in_pd] <- gsub("[^\t]", " ", lines[lines_in_pd])
708    lines[-lines_in_pd] <- ""
709
710    for(entry in split(pd, seq_len(NROW(pd)))) {
711        line1 <- entry$line1
712        line2 <- entry$line2
713        col1 <- entry$col1
714        col2 <- entry$col2
715        if(line1 == line2) {
716            if(length(ptab <- tab[[as.character(line1)]])) {
717                col1 <- which(ptab == col1) + 1L
718                col2 <- which(ptab == col2) - 1L
719            }
720            substring(lines[line1], col1, col2) <- entry$text
721        } else {
722            texts <- unlist(strsplit(entry$text, "\n", fixed = TRUE))
723            n <- length(texts)
724            if(length(ptab <- tab[[as.character(line1)]])) {
725                col1 <- which(ptab == col1) + 1L
726            }
727            substring(lines[line1], col1) <- texts[1L]
728            pos <- seq.int(from = 2L, length.out = n - 2L)
729            if(length(pos))
730                lines[line1 + pos - 1] <- texts[pos]
731            if(length(ptab <- tab[[as.character(line2)]])) {
732                col2 <- which(ptab == col2) - 1L
733            }
734            substring(lines[line2], 1L, col2) <- texts[n]
735        }
736    }
737
738    blank_out_ignores_in_lines(lines, ignore)
739}
740
741get_parse_data_for_message_strings <-
742function(file, encoding = "unknown")
743{
744    ## The message strings considered are the string constants subject to
745    ## translation in gettext-family calls (see below for details).
746
747    exprs <-
748        suppressWarnings(tools:::.parse_code_file(file = file,
749                                                  encoding = encoding,
750                                                  keep.source = TRUE))
751    if(!length(exprs)) return(NULL)
752
753    pd <- getParseData(exprs)
754
755    ## Function for computing grandparent ids.
756    parents <- pd$parent
757    names(parents) <- pd$id
758    gpids <- function(ids)
759        parents[as.character(parents[as.character(ids)])]
760
761    ind <- (pd$token == "SYMBOL_FUNCTION_CALL") &
762        !is.na(match(pd$text,
763                     c("warning", "stop",
764                       "message", "packageStartupMessage",
765                       "gettext", "gettextf", "ngettext")))
766
767    funs <- pd$text[ind]
768
769    ids <- gpids(pd$id[ind])
770    calls <- getParseText(pd, ids)
771
772    table <- pd[pd$token == "STR_CONST", ]
773    ## Could have run into truncation ...
774    table$text <- getParseText(table, table$id)
775    pos <- match(gpids(table$id), ids)
776    ind <- !is.na(pos)
777    table <- split(table[ind, ], factor(pos[ind], seq_along(ids)))
778
779    ## We have synopses
780    ##   message(..., domain = NULL, appendLF = TRUE)
781    ##   packageStartupMessage(..., domain = NULL, appendLF = TRUE)
782    ##   warning(..., call. = TRUE, immediate. = FALSE, domain = NULL)
783    ##   stop(..., call. = TRUE, domain = NULL)
784    ##   gettext(..., domain = NULL)
785    ##   ngettext(n, msg1, msg2, domain = NULL)
786    ##   gettextf(fmt, ..., domain = NULL)
787    ## For the first five, we simply take all unnamed strings.
788    ## (Could make this more precise, of course.)
789    ## For the latter two, we take the msg1/msg2 and fmt arguments,
790    ## provided these are strings.
791
792    ## <NOTE>
793    ## Using domain = NA inhibits translation: perhaps it should
794    ## optionally also inhibit spell checking?
795    ## </NOTE>
796
797    extract_message_strings <- function(fun, call, table) {
798        ## Matching a call containing ... gives
799        ##   Error in match.call(message, call) :
800        ##   ... used in a situation where it doesn't exist
801        ## so eliminate these.
802        ## (Note that we also drop "..." strings.)
803        call <- str2lang(call)
804        call <- call[ as.character(call) != "..." ]
805        mc <- as.list(match.call(get(fun, envir = .BaseNamespaceEnv),
806                                 call))
807        args <- if(fun == "gettextf")
808            mc["fmt"]
809        else if(fun == "ngettext")
810            mc[c("msg1", "msg2")]
811        else {
812            if(!is.null(names(mc)))
813                mc <- mc[!nzchar(names(mc))]
814            mc[-1L]
815        }
816        strings <- as.character(args[vapply(args, is.character, TRUE)])
817        ## Need to canonicalize to match string constants before and
818        ## after parsing ...
819        texts <- vapply(str2expression(table$text), as.character, "")
820        pos <- which(!is.na(match(texts, strings)))
821        cbind(table[pos, ], caller = rep.int(fun, length(pos)))
822    }
823
824    do.call(rbind,
825            Map(extract_message_strings,
826                as.list(funs), as.list(calls), table))
827}
828
829## For spell-checking the R R files.
830
831aspell_R_R_files <-
832function(which = NULL, dir = NULL,
833         ignore = c("[ \t]'[^']*'[ \t[:punct:]]",
834                    "[ \t][[:alnum:]_.]*\\(\\)[ \t[:punct:]]"),
835         program = NULL, dictionaries = aspell_dictionaries_R)
836{
837    if(is.null(dir)) dir <- tools:::.R_top_srcdir_from_Rd()
838    if(is.null(which))
839        which <- tools:::.get_standard_package_names()$base
840
841    files <-
842        unlist(lapply(file.path(dir, "src", "library", which, "R"),
843                      tools::list_files_with_type,
844                      "code",
845                      OS_subdirs = c("unix", "windows")),
846               use.names = FALSE)
847
848    program <- aspell_find_program(program)
849
850    aspell(files,
851           filter = list("R", ignore = ignore),
852           control = aspell_control_R_Rd_files[[names(program)]],
853           program = program,
854           dictionaries = dictionaries)
855}
856
857## For spell-checking R files in a package.
858
859aspell_package_R_files <-
860function(dir, ignore = character(),
861         control = list(), program = NULL, dictionaries = character())
862{
863    dir <- tools::file_path_as_absolute(dir)
864
865    subdir <- file.path(dir, "R")
866    files <- if(dir.exists(subdir))
867        tools::list_files_with_type(subdir,
868                                    "code",
869                                    OS_subdirs = c("unix", "windows"))
870    else character()
871
872    meta <- tools:::.get_package_metadata(dir, installed = FALSE)
873    if(is.na(encoding <- meta["Encoding"]))
874        encoding <- "unknown"
875
876    defaults <- .aspell_package_defaults(dir, encoding)$R_files
877    if(!is.null(defaults)) {
878        if(!is.null(d <- defaults$ignore))
879            ignore <- d
880        if(!is.null(d <- defaults$control))
881            control <- d
882        if(!is.null(d <- defaults$program))
883            program <- d
884        if(!is.null(d <- defaults$dictionaries)) {
885            dictionaries <-
886                aspell_find_dictionaries(d, file.path(dir, ".aspell"))
887        }
888    }
889
890    program <- aspell_find_program(program)
891
892    aspell(files,
893           filter = list("R", ignore = ignore),
894           control = control,
895           encoding = encoding,
896           program = program,
897           dictionaries = dictionaries)
898}
899
900## Spell-checking pot files.
901
902## (Of course, directly analyzing the message strings would be more
903## useful, but require writing appropriate text filters.)
904
905## See also tools:::checkPoFile().
906
907aspell_filter_db$pot <-
908function (ifile, encoding = "unknown", ignore = character())
909{
910    lines <- readLines(ifile, encoding = encoding, warn = FALSE)
911
912    ind <- grepl("^msgid[ \t]", lines)
913
914    do_entry <- function(s) {
915        out <- character(length(s))
916        i <- 1L
917        out[i] <- blank_out_regexp_matches(s[i], "^msgid[ \t]+\"")
918        while(startsWith(s[i <- i + 1L], '"'))
919            out[i] <- sub("^\"", " ", s[i])
920        if(grepl("^msgid_plural[ \t]", s[i])) {
921            out[i] <- blank_out_regexp_matches(s[i], "^msgid_plural[ \t]+\"")
922            while(startsWith(s[i <- i + 1L], '"'))
923                out[i] <- sub("^\"", " ", s[i])
924        }
925        out
926    }
927
928    entries <- split(lines, cumsum(ind))
929    lines <- c(character(length(entries[[1L]])),
930               as.character(do.call(c, lapply(entries[-1L], do_entry))))
931
932    lines <- sub("\"[ \t]*$", " ", lines)
933
934    ## <FIXME>
935    ## Could replace backslash escapes for blanks and percent escapes by
936    ## blanks, similar to what the R text filter does.
937    ## </FIXME>
938
939    blank_out_ignores_in_lines(lines, ignore)
940}
941
942## For spell-checking all pot files in a package.
943
944aspell_package_pot_files <-
945function(dir, ignore = character(),
946         control = list(), program = NULL, dictionaries = character())
947{
948    dir <- tools::file_path_as_absolute(dir)
949    subdir <- file.path(dir, "po")
950    files <- if(dir.exists(subdir))
951        Sys.glob(file.path(subdir, "*.pot"))
952    else character()
953
954    meta <- tools:::.get_package_metadata(dir, installed = FALSE)
955    if(is.na(encoding <- meta["Encoding"]))
956        encoding <- "unknown"
957
958    program <- aspell_find_program(program)
959
960    aspell(files,
961           filter = list("pot", ignore = ignore),
962           control = control,
963           encoding = encoding,
964           program = program,
965           dictionaries = dictionaries)
966}
967
968## For spell-checking the R C files.
969
970aspell_R_C_files <-
971function(which = NULL, dir = NULL,
972         ignore = c("[ \t]'[[:alnum:]_.]*'[ \t[:punct:]]",
973                    "[ \t][[:alnum:]_.]*\\(\\)[ \t[:punct:]]"),
974         program = NULL, dictionaries = aspell_dictionaries_R)
975{
976    if(is.null(dir)) dir <- tools:::.R_top_srcdir_from_Rd()
977    if(is.null(which))
978        which <- tools:::.get_standard_package_names()$base
979    if(!is.na(pos <- match("base", which)))
980        which[pos] <- "R"
981
982    files <- sprintf("%s.pot",
983                     file.path(dir, "src", "library",
984                               which, "po", which))
985    files <- files[file_test("-f", files)]
986
987    program <- aspell_find_program(program)
988
989    aspell(files,
990           filter = list("pot", ignore = ignore),
991           control = aspell_control_R_Rd_files[[names(program)]],
992           program = program,
993           dictionaries = dictionaries)
994}
995
996## For spell-checking package C files.
997
998aspell_package_C_files <-
999function(dir, ignore = character(),
1000         control = list(), program = NULL, dictionaries = character())
1001{
1002    dir <- tools::file_path_as_absolute(dir)
1003    ## Assume that the package C message template file is shipped as
1004    ## 'po/PACKAGE.pot'.
1005    files <- file.path(dir, "po",
1006                       paste(basename(dir), "pot", collapse = "."))
1007    files <- files[file_test("-f", files)]
1008
1009    meta <- tools:::.get_package_metadata(dir, installed = FALSE)
1010    if(is.na(encoding <- meta["Encoding"]))
1011        encoding <- "unknown"
1012
1013    defaults <- .aspell_package_defaults(dir, encoding)$C_files
1014    if(!is.null(defaults)) {
1015        if(!is.null(d <- defaults$ignore))
1016            ignore <- d
1017        if(!is.null(d <- defaults$control))
1018            control <- d
1019        if(!is.null(d <- defaults$program))
1020            program <- d
1021        if(!is.null(d <- defaults$dictionaries)) {
1022            dictionaries <-
1023                aspell_find_dictionaries(d, file.path(dir, ".aspell"))
1024        }
1025    }
1026
1027    program <- aspell_find_program(program)
1028
1029    aspell(files,
1030           filter = list("pot", ignore = ignore),
1031           control = control,
1032           encoding = encoding,
1033           program = program,
1034           dictionaries = dictionaries)
1035}
1036
1037## Spell-checking DCF files.
1038
1039aspell_filter_db$dcf <-
1040function(ifile, encoding, keep = c("Title", "Description"),
1041         ignore = character())
1042{
1043    lines <- readLines(ifile, encoding = encoding, warn = FALSE)
1044    line_has_tags <- grepl("^[^[:blank:]][^:]*:", lines)
1045    tags <- sub(":.*", "", lines[line_has_tags])
1046    lines[line_has_tags] <-
1047        blank_out_regexp_matches(lines[line_has_tags], "^[^:]*:")
1048    lines <- split(lines, cumsum(line_has_tags))
1049    ind <- is.na(match(tags, keep))
1050    lines[ind] <- lapply(lines[ind], function(s) rep.int("", length(s)))
1051    ind <- !ind
1052    lines[ind] <- lapply(lines[ind], paste0, " ")
1053    lines <- unlist(lines, use.names = FALSE)
1054    blank_out_ignores_in_lines(lines, ignore)
1055}
1056
1057## For spell-checking package DESCRIPTION files.
1058
1059aspell_package_description <-
1060function(dir, ignore = character(),
1061         control = list(), program = NULL, dictionaries = character())
1062{
1063    dir <- tools::file_path_as_absolute(dir)
1064    files <- file.path(dir, "DESCRIPTION")
1065
1066    meta <- tools:::.get_package_metadata(dir, installed = FALSE)
1067    if(is.na(encoding <- meta["Encoding"]))
1068        encoding <- "unknown"
1069
1070    ## Allow providing package defaults but make this controllable via
1071    ##   _R_ASPELL_USE_DEFAULTS_FOR_PACKAGE_DESCRIPTION_
1072    ## to safeguard against possible mis-use for CRAN incoming checks.
1073    defaults <-
1074        Sys.getenv("_R_ASPELL_USE_DEFAULTS_FOR_PACKAGE_DESCRIPTION_",
1075                   "TRUE")
1076    defaults <- if(tools:::config_val_to_logical(defaults)) {
1077                    .aspell_package_defaults(dir, encoding)$description
1078                } else NULL
1079    if(!is.null(defaults)) {
1080        if(!is.null(d <- defaults$ignore))
1081            ignore <- d
1082        if(!is.null(d <- defaults$control))
1083            control <- d
1084        if(!is.null(d <- defaults$program))
1085            program <- d
1086        if(!is.null(d <- defaults$dictionaries)) {
1087            dictionaries <-
1088                aspell_find_dictionaries(d, file.path(dir, ".aspell"))
1089        }
1090    }
1091
1092    program <- aspell_find_program(program)
1093
1094    aspell(files,
1095           filter = list("dcf", ignore = ignore),
1096           control = control,
1097           encoding = encoding,
1098           program = program,
1099           dictionaries = dictionaries)
1100}
1101
1102## Spell-checking Markdown files.
1103
1104aspell_filter_db$md <-
1105function(ifile, encoding = "UTF-8")
1106{
1107    x <- readLines(ifile, encoding = encoding, warn = FALSE)
1108    n <- nchar(x)
1109    y <- strrep(rep.int(" ", length(x)), n)
1110    ## Determine positions of 'texts' along the lines of
1111    ## spelling::parse_text_md () by Jeroen Ooms.
1112    md <- commonmark::markdown_xml(x, extensions = TRUE,
1113                                   sourcepos = TRUE)
1114    doc <- xml2::xml_ns_strip(xml2::read_xml(md))
1115    pos <- strsplit(xml2::xml_attr(xml2::xml_find_all(doc,
1116                                                      "//text[@sourcepos]"),
1117                                   "sourcepos"),
1118                    "[:-]")
1119    ## Now use the following idea.
1120    ## Each elt of pos now has positions for l1:c1 to l2:c2.
1121    ## If l1 < l2
1122    ##   Lines in (l1, l2) are taken as a whole
1123    ##   Line l1 from c1 to nchar for l1
1124    ##   Line l2 from  1 to c1
1125    ## otherwise
1126    ##   Line l1 from c1 to c2.
1127    for(p in pos) {
1128        p <- as.integer(p)
1129        ## Legibility ...
1130        l1 <- p[1L]; c1 <- p[2L]; l2 <- p[3L]; c2 <- p[4L]
1131        if(l1 < l2) {
1132            substring(y[l1], c1, n[l1]) <- substring(x[l1], c1, n[l1])
1133            if(l1 + 1L < l2) {
1134                w <- seq.int(from = l1 + 1L, to = l2 - 1L)
1135                y[w] <- x[w]
1136            }
1137            substring(y[l2], 1L, c2) <- substring(x[l2], 1L, c2)
1138        } else {
1139            substring(y[l1], c1, c2) <- substring(x[l1], c1, c2)
1140        }
1141    }
1142    y
1143}
1144
1145## For spell checking packages.
1146
1147aspell_package <-
1148function(dir,
1149         control = list(), program = NULL, dictionaries = character())
1150{
1151    args <- list(dir = dir,
1152                 program = program,
1153                 control = control,
1154                 dictionaries = dictionaries)
1155    a <- rbind(do.call(aspell_package_description, args),
1156               do.call(aspell_package_Rd_files, args),
1157               do.call(aspell_package_vignettes, args),
1158               do.call(aspell_package_R_files, args),
1159               do.call(aspell_package_C_files, args))
1160    if(nrow(a)) {
1161        a$File <- tools:::.file_path_relative_to_dir(a$File,
1162                                                     dirname(dir))
1163    }
1164    a
1165}
1166
1167## For writing personal dictionaries:
1168
1169aspell_write_personal_dictionary_file <-
1170function(x, out, language = "en", program = NULL)
1171{
1172    if(inherits(x, "aspell"))
1173        x <- sort(unique(x$Original))
1174
1175    program <- aspell_find_program(program)
1176    if(is.na(program))
1177        stop("No suitable spell check program found.")
1178
1179    ## <NOTE>
1180    ## Ispell and Hunspell take simple word lists as personal dictionary
1181    ## files, but Aspell requires a special format, see e.g.
1182    ## http://aspell.net/man-html/Format-of-the-Personal-and-Replacement-Dictionaries.html
1183    ## and one has to create these by hand, as
1184    ##   aspell --lang=en create personal ./foo "a b c"
1185    ## gives: Sorry "create/merge personal" is currently unimplemented.
1186
1187    ## Encodings are a nightmare.
1188    ## Try to canonicalize to UTF-8 for Aspell (which allows recording
1189    ## the encoding in the personal dictionary).
1190    ## <FIXME>
1191    ## What should we do for Hunspell (which can handle UTF-8, but has
1192    ## no encoding information in the personal dictionary), or Ispell
1193    ## (which cannot handle UTF-8)?
1194    ## </FIXME>
1195
1196    if(names(program) == "aspell") {
1197        header <- sprintf("personal_ws-1.1 %s %d UTF-8",
1198                          language, length(x))
1199        x <- enc2utf8(x)
1200    }
1201    else {
1202        header <- NULL
1203    }
1204
1205    writeLines(c(header, x), out, useBytes = TRUE)
1206}
1207
1208## For reading package defaults:
1209
1210.aspell_package_defaults <-
1211function(dir, encoding = "unknown")
1212{
1213    dfile <- file.path(dir, ".aspell", "defaults.R")
1214    if(!file_test("-f", dfile))
1215        return(NULL)
1216    exprs <- parse(dfile, encoding = encoding)
1217    envir <- new.env()
1218    for(e in exprs) eval(e, envir)
1219    as.list(envir)
1220}
1221
1222## Utilities.
1223
1224blank_out_regexp_matches <-
1225function(s, re, ...)
1226{
1227    m <- gregexpr(re, s, ...)
1228    regmatches(s, m) <-
1229        Map(function(n) strrep(" ", n),
1230            lapply(regmatches(s, m), nchar))
1231    s
1232}
1233
1234blank_out_ignores_in_lines <-
1235function(lines, ignore)
1236{
1237    args <- list()
1238    if(is.list(ignore)) {
1239        args <- ignore[-1L]
1240        ignore <- ignore[[1L]]
1241    }
1242    for(re in ignore[nzchar(ignore)])
1243        lines <- do.call(blank_out_regexp_matches,
1244                         c(list(lines, re), args))
1245    lines
1246}
1247
1248find_files_in_directories <-
1249function(basenames, dirnames)
1250{
1251    dirnames <- dirnames[dir.exists(dirnames)]
1252    dirnames <- normalizePath(dirnames, "/")
1253
1254    out <- character(length(basenames))
1255    pos <- seq_along(out)
1256
1257    for(dir in dirnames) {
1258        paths <- file.path(dir, basenames[pos])
1259        ind <- file_test("-f", paths)
1260        out[pos[ind]] <- paths[ind]
1261        pos <- pos[!ind]
1262        if(!length(pos)) break
1263    }
1264
1265    out
1266}
1267