1## $Id: sas.get.s 827 2012-10-21 13:13:34Z harrelfe $ 2sas.get <- 3 function(libraryName, 4 member, 5 variables = character(0), 6 ifs = character(0), 7 format.library = libraryName, 8 id, 9 dates. = c("sas","yymmdd","yearfrac","yearfrac2"), 10 keep.log = TRUE, 11 log.file = "_temp_.log", 12 macro = sas.get.macro, 13 data.frame.out = existsFunction("data.frame"), 14 clean.up = FALSE, 15 quiet = FALSE, 16 temp = tempfile("SaS"), 17 formats=TRUE, 18 recode=formats, 19 special.miss=FALSE, 20 sasprog="sas", 21 as.is=.5, 22 check.unique.id=TRUE, 23 force.single=FALSE, 24 where, 25 uncompress=FALSE) 26{ 27 if(force.single) stop('force.single does not work under R') 28 dates. <- match.arg(dates.) 29 30 fexists <- function(name) { 31 w <- file.exists(name) 32 attr(w, 'which') <- name[w] 33 w 34 } 35 36 file.is.dir <- function(name) { 37 isdir <- file.info(name)$isdir 38 isdir && !is.na(isdir) 39 } 40 41 file.is.readable <- function(name) file.access(name,4)==0 42 43 fileShow <- function(x) file.show(x) 44 45 if(recode) formats <- TRUE 46 47 if(missing(formats) || formats) { 48 ## ***** Next line begins mod from Mike Kattan edits 11 Sep 97 49 ## Redone FEH 22Oct00 50 no.format <- all(!fexists(file.path(format.library, 51 c('formats.sc2','formats.sct','formats.sct01','formats.sas7bcat')))) 52 if(no.format) { 53 if((!missing(formats) && formats) || (!missing(recode) && recode)) 54 warning(paste(paste(format.library, 55 "/formats.sc? or formats.sas7bcat",sep = ""), 56 " not found. Formatting ignored. \n")) 57 formats <- recode <- FALSE 58 } 59 ## ***** End Mike Kattan edits 11 Sep 97 60 } 61 62 ## 5 Changes here from Claudie Berger <claudie@osteo1.ri.mgh.mcgill.ca> 19feb00 63 ## Allows work on sas v7. 64 sasin <- paste(temp, ".3.sas", sep = "") 65 sasout1 <- paste(temp, ".1.sas", sep = "") 66 sasout2 <- paste(temp, ".2.sas", sep = "") 67 sasout3 <- paste(temp, ".4.sas", sep = "") 68 sasout4 <- paste(temp, ".5.sas", sep = "") 69 nvariables <- length(variables) 70 if(nvariables>0) { 71 if(any(jdup <- duplicated(variables))) 72 stop(paste("duplicate variables requested: ", variables[jdup])) 73 } 74 75 varstring <- paste(variables, collapse = "\n ") 76 ifs <- paste("'",paste(ifs, collapse = ";\n "),"'",sep="") 77 if(length(sasin) != 1) 78 stop("Illegal temporary file name") 79 80 temp.files <- c(sasin, sasout1, sasout2, sasout3, sasout4) 81 if(!keep.log) 82 temp.files <- c(temp.files, log.file) 83 84 if(clean.up) 85 on.exit(unlink(temp.files)) 86 ##on.exit(sys(paste("rm -f", paste(temp.files, collapse = " ")))) 87 ## 4oct03 88 89 if(missing(member)) 90 stop("SAS member name is required") 91 92 if(missing(libraryName)) 93 stop("SAS library name is required") 94 95 cat(macro, sep="\n", file=sasin) 96 97 sasds.suffix <- c('sd2','sd7','ssd01','ssd02','ssd03','ssd04','sas7bdat') 98 ## 22Oct00 99 100 if(libraryName == "") { 101 if(uncompress) { # 22Oct00 102 unix.file <- paste(member, sasds.suffix, sep=".") 103 if(any(fe <- fexists(paste(unix.file,".gz",sep="")))) 104 system(paste("gunzip ",attr(fe,'which'),'.gz',sep='')) 105 else if(any(fe <- fexists(paste(unix.file,".Z",sep="")))) 106 system(paste("uncompress ",attr(fe,'which'),'.Z',sep='')) 107 } 108 109 cat("%sas_get(", member, ",\n", 110 " ", sasout1, ",\n", 111 " ", sasout2, ",\n", 112 " ", sasout3, ",\n", 113 " ", sasout4, ",\n", 114 " dates=", dates., ",\n", 115 " vars=", varstring, ",\n", 116 " ifs=", ifs, ",\n", 117 " formats=", as.integer(formats), "\n,", 118 " specmiss=", as.integer(special.miss), ");\n", 119 file = sasin, append = TRUE, sep = "") 120 } else { 121 if(!file.is.dir(libraryName)) 122 stop(paste(sep = "", "library, \"", libraryName, 123 "\", is not a directory")) 124 125 unix.file <- file.path(libraryName, paste(member, sasds.suffix, sep=".")) 126 127 ##23Nov00 128 if(uncompress) { #22Oct00 129 if(any(fe <- fexists(paste(unix.file,".gz",sep="")))) 130 system(paste("gunzip ", attr(fe,'which'),'.gz',sep='')) 131 else if(any(fe <- fexists(paste(unix.file,".Z",sep="")))) 132 system(paste("uncompress ",attr(fe,'which'),'.Z',sep='')) 133 } 134 135 if(!any(fe <- fexists(unix.file))) { 136 stop(paste(sep = "", "Unix file, \"", 137 paste(unix.file,collapse=' '), 138 "\", does not exist")) 139 } else { 140 file.name <- attr(fe,'which') 141 if(!file.is.readable(file.name)) { 142 stop(paste(sep = "", 143 "You do not have read permission for Unix file, \"", 144 file.name, "\"")) # 22Oct00 145 } 146 } 147 148 cat("libname temp '", libraryName, "';\n", file = sasin, append = TRUE, 149 sep = "") 150 151 ## format.library should contain formats.sct containing user defined 152 ## formats used by this dataset. It must be present. 153 cat("libname library '", format.library, "';\n", file = sasin, 154 append = TRUE, sep = "") 155 cat("%sas_get(temp.", member, ",\n", 156 " ", sasout1, ",\n", 157 " ", sasout2, ",\n", 158 " ", sasout3, ",\n", 159 " ", sasout4, ",\n", 160 " dates=", dates., ",\n", 161 " vars=", varstring, ",\n", 162 " ifs=", ifs, ",\n", 163 " formats=", as.integer(formats), "\n,", 164 " specmiss=", as.integer(special.miss), ");\n", 165 file = sasin, append = TRUE, sep = "") 166 } 167 168 status <- system(paste(shQuote(sasprog), shQuote(sasin), "-log", 169 shQuote(log.file)), intern=FALSE) 170 ## 24nov03 added output=F 171 if(status != 0) { 172 if(!quiet && fexists(log.file)) fileShow(log.file) ## 4oct03 173 stop(paste("SAS job failed with status", status)) 174 } 175 ## 176 ## Read in the variable information 177 ## 178 if(!(fexists(sasout1) && fexists(sasout2))) { 179 if(!quiet) 180 fileShow(log.file) ## 4oct03 181 182 stop("SAS output files not found") 183 } 184 185 vars <- 186 scan(sasout1, list(name = "", type = 0, length = 0, 187 format = "", label = "", n = 0), 188 multi.line = FALSE, sep = "\022", 189 flush=TRUE, comment.char='', quote='') 190 ## Thanks Don MacQueen for scan fix for R 191 192 nvar <- length(vars$name) 193 if(nvar == 0) { 194 if(!quiet) 195 fileShow(log.file) ## 4oct03 196 197 stop("First SAS output is empty") 198 } 199 200 nrow <- vars$n[1] #n is the same for each variable 201 202 ## Read the data in 203 ## We try to be clever about the variable type. If SAS is character 204 ## use char of course. If is numeric and length >4, use double. If 205 ## numeric and length <4, use single. We could also use the format to 206 ## choose further, if it consists of a number followed by a "." 207 ## can we safely assume integer. 208 ## 209 type <- ifelse(vars$type == 2, "character(nrow)", 210 ifelse(force.single, ##28Mar01 211 "single(nrow)", "double(nrow)")) 212 ##BILL: I corrected the macro so the following isn't needed: 213 ## get rid of trailing blank on names 214 ## vars$name <- unix("sed 's/ $//'", vars$name) 215 inlist <- paste("\"", vars$name, "\"=", type, 216 sep = "", collapse = ", ") 217 218 inlist <- parse(text = paste("list(", inlist, ")")) 219 ## Inlist would now be the size of the final data structure, if I had 220 ## evaluated it. 221 222 ## Read the data 223 ds <- 224 scan(sasout2, eval(inlist), sep = "\022", multi.line = FALSE, 225 flush=TRUE, comment.char='', quote='') 226 227 if(length(ds) < nvariables) { 228 m <- variables[is.na(match(variables, names(ds)))] 229 if(length(m) > 0) { 230 warning(paste(length(m), 231 "requested variables did not exist:", 232 paste("\"", m, "\"", sep = "", collapse = " "), 233 "\n\t(use sas.contents())")) 234 } 235 } 236 237 format <- vars$format 238 format[format=='$'] <- ' ' # 1Mar00 239 label <- vars$label 240 name <- vars$name 241 esasout3 <- formats && fexists(sasout3) #added formats && 1/20/93 242 if(recode && !esasout3) recode <- FALSE 243 FORMATS <- NULL 244 245 if(formats && esasout3) { 246 FORMATS <- dget(sasout3) 247 if(length(FORMATS)==0) { 248 FORMATS <- NULL; 249 recode <- FALSE 250 } 251 } 252 253 smiss <- NULL 254 if(special.miss && fexists(sasout4)) 255 smiss <- 256 scan(sasout4, list(name="", code="", obs=integer(1)), 257 multi.line=FALSE, flush=TRUE, sep="\022", 258 comment.char='', quote='') 259 260 sasdateform <- c("date","mmddyy","yymmdd","ddmmyy","yyq","monyy", 261 "julian","qtr","weekdate","weekdatx","weekday","month") 262 dateform <- 263 list(as.name("ddmmmyy"),"m/d/y","y/m/d","d/m/y",as.name("ddmmmyy"), 264 "mon year",as.name("ddmmmyy"),"mon",as.name("ddmmmyy"), 265 as.name("ddmmmyy"), as.name("ddmmmyy"),"m") 266 267 sastimeform <- c("hhmm","hour","mmss","time") 268 timeform <- c("h:m","h","m:s","h:m:s") 269 sasdatetimeform <- c("datetime","tod") 270 datetimeform <- list(list(as.name("ddmmmyy"),"h:m:s"), c("m/d/y"," ")) 271 z <- "%02d%b%Y" 272 dateform4 <- 273 c(z,"%02m/%02d/%Y","%Y/%02m/%02d","%02d/%02m/%Y", z,"%02m %Y", 274 z,"%02m", z, z, z,"%02m") 275 276 timeform4 <- c("%02H:%02M","%02H","%02M:%02S","%02H:%02M:%02S") 277 datetimeform4 <- c("%02d%b%Y %02h:%02m:%02s","%02m/%02d/%Y") 278 279 ## Don MacQueen 280 days.to.adj <- as.numeric(difftime(ISOdate(1970,1,1,0,0,0) , 281 ISOdate(1960,1,1,0,0,0), 'days')) 282 secs.to.adj <- days.to.adj*24*60*60 283 284 for(i in 1:nvar) { 285 atr <- list() 286 dsi <- ds[[i]] 287 fname <- format[i] 288 rec <- FALSE 289 if(fname!=" ") { 290 ff <- fname 291 if(dates.=="sas" & (m <- match(fname,sasdateform,0)) >0) { 292 ##look for partial dates 293 dd <- dsi-floor(dsi) 294 ddn <- !is.na(dd) 295 if(any(ddn) && any(dd[ddn]!=0)) { 296 ll <- 1:length(dd) 297 atr$partial.date <- 298 list(month=ll[dd==.5],day=ll[dd==.25],both=ll[dd==.75]) 299 atr$imputed <- ll[dd!=0] 300 dsi <- floor(dsi) 301 } 302 dsi <- importConvertDateTime(dsi, 'date', 'sas', 303 form=dateform[m]) 304 305 if(length(atr$imputed)) 306 attr(dsi,'class') <- c("impute",attr(dsi,'class')) 307 308 ff <- NULL 309 } else { 310 if((m <- match(fname,sastimeform,0)) >0) { 311 dsi <- importConvertDateTime(dsi, 'time', 'sas', 312 form=timeform[m]) 313 ff <- NULL 314 } else if((m <- match(fname,sasdatetimeform,0))>0) { 315 dsi <- importConvertDateTime(dsi, 'datetime', 'sas', 316 form=datetimeform[m]) 317 318 ff <- NULL 319 } 320 } 321 322 atr$format <- ff 323 if(recode & length(g <- FORMATS[[fname]])) { 324 labs <- g$labels 325 if(!is.logical(recode)) { 326 labs <- if(recode==1) paste(g$values,":",labs,sep="") 327 else paste(labs,"(",g$values,")",sep="") 328 } 329 330 dsi <- factor(dsi, g$values, labs) 331 atr$sas.codes <- g$values 332 rec <- TRUE 333 } 334 } 335 336 if(data.frame.out && !rec && vars$type[i]==2 && 337 ((is.logical(as.is) && !as.is) || 338 (is.numeric(as.is) && length(unique(dsi)) < as.is*length(dsi)))) 339 dsi <- factor(dsi, exclude="") #exclude added 5Mar93 340 341 ## For data frames, char. var usually factors 342 if(label[i]!=" ") 343 label(dsi) <- label[i] #atr$label <- label[i] 344 345 if(length(smiss$name)) { 346 j <- smiss$name==name[i] 347 if(any(j)) { 348 atr$special.miss <- 349 list(codes=smiss$code[j],obs=smiss$obs[j]) 350 attr(dsi,'class') <- c("special.miss",attr(dsi,'class')) 351 } 352 } 353 354 if(!is.null(atr)) 355 attributes(dsi) <- c(attributes(dsi),atr) 356 357 if(missing(where)) 358 ds[[i]] <- dsi 359 else 360 assign(name[i], dsi, where=where) 361 } 362 363 if(!missing(where)) 364 return(structure(where, class="where")) 365 366 atr <- list() 367 368 if(missing(id)) { 369 if(data.frame.out) 370 atr$row.names <- as.character(1:nrow) 371 } else { 372 idname <- id 373 jj <- match(idname, names(ds), 0) 374 if(any(jj==0)) 375 stop(paste("id variable(s) not in dataset:", 376 paste(idname[jj==0],collapse=" "))) 377 378 if(length(idname)==1) { 379 id <- ds[[idname]] #Need since not use data.frame 380 } else { 381 id <- as.character(ds[[idname[1]]]) 382 for(jj in 2:length(idname)) 383 id <- paste(id, as.character(ds[[idname[jj]]])) 384 } 385 386 if(check.unique.id) { 387 dup <- duplicated(id) 388 if(any(dup)) 389 warning(paste("duplicate IDs:", 390 paste(id[dup], collapse=" "))) 391 } 392 393 if(data.frame.out) 394 atr$row.names <- as.character(id) 395 else atr$id <- id 396 } 397 398 if(!is.null(FORMATS)) 399 atr$formats <- FORMATS 400 401 if(data.frame.out) 402 atr$class <- "data.frame" 403 404 attributes(ds) <- c(attributes(ds),atr) 405 ds 406} 407 408importConvertDateTime <- 409 function(x, type=c('date','time','datetime'), 410 input=c('sas','spss','dataload'), form) { 411 type <- match.arg(type) 412 input <- match.arg(input) 413 414 if(input != 'sas' && type != 'date') 415 stop('only date variables are support for spss, dataload') 416 417 adjdays <- c(sas=3653, spss=141428, dataload=135080)[input] 418 ## 1970-1-1 minus 1960-1-1, 1582-10-14, or 1600-3-1 419 if(input=='spss') x <- x/86400 420 421 switch(type, 422 date = structure(x - adjdays, class='Date'), 423 time = { 424 ## Don MacQueen 3Apr02 425 z <- structure(x, class=c('POSIXt','POSIXct')) 426 f <- format(z, tz='GMT') 427 z <- as.POSIXct(format(z, tz='GMT'), tz='') 428 structure(z, class=c('timePOSIXt','POSIXt','POSIXct'))}, 429 datetime = { 430 chron((x - adjdays*86400)/86400, 431 out.format=c(dates='day mon year', times='h:m:s'))}) 432} 433 434 435## Don MacQueen 3Apr02 436## slightly modified copy of format.POSIXct() from R base 437format.timePOSIXt <- function (x, format = "%H:%M:%S", tz = "", 438 usetz = FALSE, ...) { 439 if (!inherits(x, c("timePOSIXt","POSIXct"))) stop("wrong class") 440 class(x) <- class(x)[-1] 441 structure(format.POSIXlt(as.POSIXlt(x, tz), format, usetz, ...), 442 names = names(x)) 443} 444 445print.timePOSIXt <- function(x, ...) print(format(x, ...)) 446 447 448##if(!.R.) { 449## Output format routine needed by chron for usual SAS date format 450ddmmmyy <- function(x) 451{ 452 y <- month.day.year(trunc(oldUnclass(x)), attr(x,"origin")) 453 yr <- y$year 454 m <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct", 455 "Nov","Dec")[y$month] 456 ifelse(yr<1900 | yr>=2000, paste(y$day,m,yr,sep=""), 457 paste(y$day,m,yr-1900,sep="")) 458} 459 460 461## Functions to handle special.miss class 462is.special.miss <- function(x, code) 463{ 464 sm <- attr(x, "special.miss") 465 if(!length(sm)) 466 return(rep(FALSE, length(x))) 467 468 if(missing(code)) { 469 z <- rep(FALSE, length(x)) 470 z[sm$obs] <- TRUE 471 } else { 472 z <- rep(FALSE, length(x)) 473 z[sm$obs[sm$codes==code]] <- TRUE 474 } 475 476 z 477} 478 479 480"[.special.miss" <- function(x, ..., drop=FALSE) 481{ 482 ats <- attributes(x) 483 ats$dimnames <- NULL 484 ats$dim <- NULL 485 ats$names <- NULL 486 attr(x,'class') <- NULL 487 y <- x[..., drop = drop] 488 if(length(y) == 0) 489 return(y) 490 491 k <- seq(along=x) 492 names(k) <- names(x) 493 k <- k[...] 494 attributes(y) <- c(attributes(y), ats) 495 smiss <- attr(y, "special.miss") 496 codes <- rep("ZZ",length(x)) 497 codes[smiss$obs] <- smiss$codes 498 codes <- codes[...] 499 which <- codes!="ZZ" 500 if(sum(which)) attr(y,"special.miss") <- 501 list(obs=seq(along=k)[codes!="ZZ"],codes=codes[codes!="ZZ"]) 502 else { 503 attr(y,"special.miss") <- NULL 504 attr(y,'class') <- attr(y,'class')[attr(y,'class') != "special.miss"] 505 if(length(attr(y,'class'))==0) 506 attr(y,'class') <- NULL 507 } 508 509 y 510} 511 512 513format.special.miss <- function(x, ...) 514{ 515 w <- 516 if(is.factor(x)) 517 as.character(x) 518 else { 519 cl <- attr(x,'class'); 520 cl <- cl[cl!="special.miss"] 521 if(length(cl)) { 522 attr(x,'class') <- cl; 523 format(x, ...) 524 } else format.default(x, ...) 525 } 526 527 sm <- attr(x, "special.miss") 528 names(w) <- names(x) 529 if(!length(sm)) 530 return(w) 531 532 w[sm$obs] <- sm$codes 533 attr(w,"label") <- attr(w,"special.miss") <- attr(w,"class") <- NULL 534 w 535} 536 537 538print.special.miss <- function(x, ...) 539{ 540 sm <- attr(x, "special.miss") 541 if(!length(sm)) { 542 print.default(x) 543 return(invisible()) 544 } 545 546 w <- format.special.miss(x) 547 print.default(w, quote=FALSE) 548 invisible() 549} 550 551 552sas.codes <- function(object) attr(object, "sas.codes") 553 554 555code.levels <- function(object) { 556 if(length(cod <- attr(object,"sas.codes"))) 557 levels(object) <- paste(cod,":",levels(object),sep="") 558 559 object 560} 561 562 563as.data.frame.special.miss <- function(x, row.names = NULL, optional = FALSE, ...) 564{ 565 nrows <- length(x) 566 if(is.null(row.names)) { 567 ## the next line is not needed for the 1993 version of data.class and is 568 ## included for compatibility with 1992 version 569 if(length(row.names <- names(x)) == nrows && 570 !any(duplicated(row.names))) { 571 } 572 else if(optional) 573 row.names <- character(nrows) 574 else row.names <- as.character(1:nrows) 575 } 576 577 value <- list(x) 578 if(!optional) 579 names(value) <- deparse(substitute(x))[[1]] 580 581 structure(value, row.names=row.names, class='data.frame') 582} 583 584 585## val{nval}=compress(value)||"" was =value 23mar04 586sas.get.macro <- 587 c("/* Macro sas_get (modified by F. Harrell 30Jan90, Bill Dunlap Dec90, FH Mar92,", 588 "\t\t\tFH Apr95 (extend LENGTH smiss))", 589 " Sets up for conversion of SAS dataset to S dataset.", 590 " Arguments:", "\tdataset - name of SAS dataset", 591 "\ttemp1\t- Name of temporary dataset to contain data dictionar (unquoted)", 592 "\t\t default=/tmp/file.1", 593 "\ttemp2\t- Name of temporary dataset to contain ASCII version of SAS", 594 "\t\t dataset (unquoted)", "\t\t default=/tmp/file.2", 595 "\ttemp3 - Name of temporary dataset to contain ASCII file with S", 596 "\t\t program to store format values and labels", 597 "\ttemp4 - Name of temporary dataset to contain ASCII file with", 598 "\t\t locations of special missing values", 599 "\tdates\t- SAS to store date variables in SAS format ( # days from 1/1/60)", 600 "\t\t (default)", 601 "\t\t- YEARFRAC to store as days from 1/1/1900, divided by 365.25", 602 "\t\t- YEARFRAC2 to store as year + fraction of current year", 603 "\t\t- YYMMDD to store as numeric YYMMDD", 604 "\tvars - list of variable in dataset that you want returned to S", 605 " (unquoted, separate variable names with spaces) If empty,", 606 " then return all variables.", 607 " ifs - sequence of SAS subsetting if statements, (unquoted,", 608 " separated by semicolons).", 609 "\tformats - 0 (default) - do not create file on temp3 containing S", 610 "\t\t statements to store format values and labels, 1 do create", 611 "\tspecmiss- 0 (default). Set to 1 to write a data file on temp4 with", 612 "\t\t the fields: variable name, special missing value code,", 613 "\t\t observation number", 614 " */", 615 "%macro sas_get(dataset, temp1, temp2, temp3, temp4, dates=SAS, vars=, ifs=, ", 616 "\tformats=0, specmiss=0);", 617 "OPTIONS NOFMTERR;", 618 "%IF %QUOTE(&temp1)= %THEN %LET temp1=/tmp/file.1;", 619 "%IF %QUOTE(&temp2)= %THEN %LET temp2=/tmp/file.2;", 620 "%IF %QUOTE(&temp3)= %THEN %LET temp3=/tmp/file.3;", 621 "%IF %QUOTE(&temp4)= %THEN %LET temp4=/tmp/file.4;", 622 ## Next line had %QUOTE(&ifs),1,\"'\" 31oct02 623 "%LET dates=%UPCASE(&dates);", "%LET ifs=%SCAN(%QUOTE(&ifs),1,'');", 624 "%LET _s_=_sav_;", 625 "/* BILL: Can these 2 subsets be combined into one pass of the data? -Frank*/", 626 "/* Subset by observation first */", "%IF %QUOTE(&ifs)^= %THEN %DO;", 627 " data _osub_ ;", " set &dataset ;", " &ifs ;", 628 " %LET dataset=_osub_ ;", " %END;", "/* Then subset by variable */", 629 "%IF &vars^= %THEN %DO;", " data _vsub_ ;", " set &dataset ;", 630 " keep &vars ;", " %LET dataset=_vsub_ ;", " %END;", 631 "proc contents data=&dataset out=&_s_(KEEP=name type length label format nobs ", 632 " varnum) noprint; ", "%IF &formats=1 %THEN %DO;", 633 " PROC FORMAT LIBRARY=LIBRARY CNTLOUT=f(KEEP=fmtname type start end label);", 634 " DATA f; SET f; RETAIN n 0; n+1; IF type=\"C\" THEN fmtname=\"$\"||fmtname;", 635 " PROC SORT DATA=f OUT=f(DROP=n); BY fmtname n; ", 636 " *Sort by n instead of start for numerics so 13 sorts after 2;", 637 " *Dont consider formats containing ANY range of values;", 638 " *Dont consider formats that dont have at least one non-missing (if", 639 " numeric) starting value. This gets rid of formats that are used", 640 " only to label special missing values;", 641 " DATA f2; SET f; BY fmtname; RETAIN anyrange 0 anynmiss 0;", 642 " IF FIRST.fmtname THEN DO;anyrange=0;anynmiss=0;END;", 643 " IF start^=end THEN anyrange=1;", 644 " IF TYPE=\"C\" THEN anynmiss=1; ", 645 " ELSE IF (start+0)>. THEN anynmiss=1;", 646 " IF LAST.fmtname & anynmiss & ^anyrange THEN OUTPUT; KEEP fmtname;", 647 " DATA f; MERGE f f2(IN=in2); BY fmtname; IF in2;", 648 " IF TYPE=\"N\" THEN DO; IF (start+0)>.; *S cannot handle special missings;", 649 " END;", " RENAME fmtname=format start=value; DROP end;", 650 " PROC SORT DATA=&_s_(KEEP=format) OUT=sform; BY format;", 651 " DATA sform; SET sform; BY format; IF LAST.format;", 652 " DATA f; MERGE sform(IN=in1) f(IN=in2); BY format; ", 653 " IF in1 & in2;", 654 " *This keeps formats ever used by any variable;", 655 " DATA _NULL_; SET f END=_eof_; BY format;", 656 " ARRAY val{*} $ 16 val1-val500; ARRAY lab{*} $ 40 lab1-lab500; ", 657 " RETAIN done 0 nform 0 nval 0 val1-val500 \" \" lab1-lab500 \" \" bk -1; ", 658 " FILE \"&temp3\" LRECL=4096;", " IF FIRST.format THEN DO;", 659 " IF ^done THEN PUT 'list(' @@; done=1;", 660 " nform=nform+1; nval=0;", 661 " format=TRANSLATE(format,\".abcdefghijklmnopqrstuvwxyz\",", 662 " \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");", 663 " IF nform=1 THEN PUT '\"' format +bk '\"=list(' @@;", 664 " ELSE PUT ', \"' format +bk '\"=list(' @@;", " END;", 665 " nval=nval+1; ", 666 " IF nval>500 THEN DO; ERROR \">500 format values not allowed\";ABORT ABEND;", 667 " END;", ' val{nval}=compress(value)||""; lab{nval}=label; ', 668 " IF LAST.format THEN DO;", " PUT \"values=c(\" @@; ", 669 " DO i=1 TO nval; IF i>1 THEN PUT \",\" @@;", 670 " IF type=\"N\" THEN PUT val{i} +bk @@;", 671 " ELSE PUT '\"' val{i} +bk '\"' @@;", " END;", 672 " PUT \"),labels=c(\" @@;", 673 " DO i=1 TO nval; IF i>1 THEN PUT \",\" @@;", 674 " PUT '\"' lab{i} +bk '\"' @@;", " END;", 675 " PUT \"))\";", " END;", 676 " IF _eof_ THEN PUT \")\";", " %END;", 677 "PROC SORT DATA=&_s_;BY varnum;", "data _null_;", " set &_s_ end=eof;", 678 " FILE \"&temp1\"; RETAIN _bk_ -1;", " if _n_ = 1 then do;", 679 "%IF &specmiss=0 %THEN %LET ofile=_NULL_; ", 680 "%ELSE %LET ofile=smiss(KEEP=vname val obs);", 681 " put \"data &ofile; set &dataset end=eof;\";", 682 " put ' file \"&temp2\" RECFM=D LRECL=4096;';", 683 " put \" retain __delim 18 _bk_ -1 obs 0; LENGTH _xx_ $ 20 obs 5;obs+1; \";", 684 "%IF &specmiss=1 %THEN %DO;", 685 " put \"LENGTH vname $ 8 val $ 1;\"; %END;", " end;", 686 " IF type=2 THEN DO;", " PUT 'FORMAT ' name ';' @;", 687 " PUT 'IF ' name '=\" \" THEN PUT __delim IB1. @;';", 688 "/* $char added F.H. 24Mar92, dropped +_bk_ before __delim */", 689 "/* $CHAR. removed FEH 2Aug92, added null FORMAT above, added back +_bk_ */", 690 " PUT 'ELSE PUT ' name '+_bk_ __delim IB1. @;';", " END;", 691 " ELSE DO; ", " PUT 'IF ' name '<=.Z THEN _xx_=\"NA\";' @;", 692 " PUT 'ELSE _xx_=LEFT(PUT(' @;", " format=UPCASE(format);", 693 " IF format=\"DATE\"|format=\"MMDDYY\"|format=\"YYMMDD\"|", 694 "format=\"DDMMYY\"|format=\"YYQ\"|format=\"MONYY\"|format=\"JULIAN\" THEN DO;", 695 " %IF &dates=SAS %THEN", " PUT name \",BEST18.)\";", 696 " %ELSE %IF &dates=YYMMDD %THEN", " PUT name \",YYMMDD6.)\";", 697 " %ELSE %IF &dates=YEARFRAC %THEN", 698 " PUT \"(\" name \"-MDY(1,1,1900))/365.25,7.3)\";", 699 " %ELSE %IF &dates=YEARFRAC2 %THEN %DO;", 700 " PUT \"YEAR(\" name \")-1900+(\" name \"-MDY(1,1,YEAR(\" name \")))/\" @;", 701 " PUT \"(MDY(12,31,YEAR(\" name \"))-MDY(1,1,YEAR(\" name \"))+1),7.3)\";", 702 " %END;", " ;", " END;\t", 703 " ELSE DO;PUT name \",BEST18.)\" @;END;", 704 " PUT '); PUT _xx_ +_bk_ __delim IB1. @;'; *Added +_bk_ 2Aug92;", 705 "%IF &specmiss=1 %THEN %DO;", 706 " put 'IF .A<=' name '<=.Z THEN DO;", 707 " vname=\"' name +_bk_ '\"; val=put(' name ',1.); OUTPUT; END;';", 708 " %END;", " END;", "if eof then PUT 'PUT; RUN;';", "run;", 709 "%include \"&temp1\";", "data _null_; set &_s_;", 710 " retain __delim 18 _bk_ -1; ", " file \"&temp1\" LRECL=4096;", 711 " name=TRANSLATE(name,\".abcdefghijklmnopqrstuvwxyz\",", 712 "\t\t \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");", 713 " format=TRANSLATE(format,\".abcdefghijklmnopqrstuvwxyz\",", 714 " \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");", 715 " put name +_bk_ __delim IB1. type +_bk_ __delim IB1. length +_bk_ __delim IB1.", 716 " format +_bk_ __delim IB1. label +_bk_ __delim IB1. nobs +_bk_ __delim IB1.;", 717 "run;", "%IF &specmiss=1 %THEN %DO;", 718 " PROC SORT DATA=smiss OUT=smiss;BY vname val obs;", 719 " DATA _NULL_; SET smiss;FILE \"&temp4\" RECFM=D LRECL=30;", 720 " RETAIN _bk_ -1 __delim 18;", 721 " vname=TRANSLATE(vname,\".abcdefghijklmnopqrstuvwxyz\",", 722 "\t\t \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");", 723 " PUT vname +_bk_ __delim IB1. val +_bk_ __delim IB1. obs +_bk_ __delim IB1.;", 724 " RUN;", " %END;", "%mend sas_get;") 725 726cleanup.import <- 727 function(obj, labels=NULL, lowernames=FALSE, 728 force.single=TRUE, force.numeric=TRUE, 729 rmnames=TRUE, 730 big=1e20, sasdict, 731 pr=prod(dimobj) > 5e5, 732 datevars=NULL, datetimevars=NULL, 733 dateformat='%F', fixdates=c('none','year'), 734 charfactor=FALSE) 735{ 736 fixdates <- match.arg(fixdates) 737 nam <- names(obj) 738 dimobj <- dim(obj) 739 nv <- length(nam) 740 741 if(!missing(sasdict)) 742 { 743 sasvname <- makeNames(sasdict$NAME) 744 if(any(w <- nam %nin% sasvname)) 745 stop(paste('The following variables are not in sasdict:', 746 paste(nam[w],collapse=' '))) 747 748 saslabel <- structure(as.character(sasdict$LABEL), 749 names=as.character(sasvname)) 750 labels <- saslabel[nam] 751 names(labels) <- NULL 752 } 753 754 if(length(labels) && length(labels) != dimobj[2]) 755 stop('length of labels does not match number of variables') 756 757 if(lowernames) 758 names(obj) <- casefold(nam) 759 760 if(pr) 761 cat(dimobj[2],'variables; Processing variable:') 762 763 for(i in 1:dimobj[2]) 764 { 765 if(pr) cat(i,'') 766 767 x <- obj[[i]]; 768 modif <- FALSE 769 if(length(dim(x))) 770 next 771 772 if(rmnames) 773 { 774 if(length(attr(x,'names'))) 775 { 776 attr(x,'names') <- NULL 777 modif <- TRUE 778 } else if(length(attr(x,'.Names'))) 779 { 780 attr(x,'.Names') <- NULL 781 modif <- TRUE 782 } 783 } 784 785 if(length(attr(x,'Csingle'))) { 786 attr(x,'Csingle') <- NULL 787 modif <- TRUE 788 } 789 790 if(length(c(datevars,datetimevars)) && 791 nam[i] %in% c(datevars,datetimevars) && 792 !all(is.na(x))) { 793 if(!(is.factor(x) || is.character(x))) 794 stop(paste('variable',nam[i], 795 'must be a factor or character variable for date conversion')) 796 797 x <- as.character(x) 798 ## trim leading and trailing white space 799 x <- sub('^[[:space:]]+','',sub('[[:space:]]+$','', x)) 800 xt <- NULL 801 if(nam[i] %in% datetimevars) { 802 xt <- gsub('.* ([0-9][0-9]:[0-9][0-9]:[0-9][0-9])','\\1',x) 803 xtnna <- setdiff(xt, c('',' ','00:00:00')) 804 if(!length(xtnna)) xt <- NULL 805 x <- gsub(' [0-9][0-9]:[0-9][0-9]:[0-9][0-9]','',x) 806 } 807 if(fixdates != 'none') { 808 if(dateformat %nin% c('%F','%y-%m-%d','%m/%d/%y','%m/%d/%Y')) 809 stop('fixdates only supported for dateformat %F %y-%m-%d %m/%d/%y %m/%d/%Y') 810 811 x <- switch(dateformat, 812 '%F' =gsub('^([0-9]{2})-([0-9]{1,2})-([0-9]{1,2})', '20\\1-\\2-\\3',x), 813 '%y-%m-%d'=gsub('^[0-9]{2}([0-9]{2})-([0-9]{1,2})-([0-9]{1,2})', '\\1-\\2-\\3',x), 814 '%m/%d/%y'=gsub('^([0-9]{1,2})/([0-9]{1,2})/[0-9]{2}([0-9]{2})', '\\1/\\2/\\3',x), 815 '%m/%d/%Y'=gsub('^([0-9]{1,2})/([0-9]{1,2})/([0-9]{2})$','\\1/\\2/20\\3',x)) 816 } 817 x <- if(length(xt)) { 818 require('chron') 819 cform <- if(dateformat=='%F') 'y-m-d' 820 else gsub('%','',tolower(dateformat)) 821 chron(x, xt, format=c(dates=cform,times='h:m:s')) 822 } 823 else as.Date(x, format=dateformat) 824 modif <- TRUE 825 } 826 827 if(length(labels)) { 828 label(x) <- labels[i] 829 modif <- TRUE 830 } 831 832 if(force.numeric && length(lev <- levels(x))) { 833 if(all.is.numeric(lev)) { 834 labx <- attr(x,'label') 835 x <- as.numeric(as.character(x)) 836 label(x) <- labx 837 modif <- TRUE 838 } 839 } 840 841 if(storage.mode(x) == 'double') { 842 xu <- oldUnclass(x) 843 j <- is.infinite(xu) | is.nan(xu) | abs(xu) > big 844 if(any(j,na.rm=TRUE)) { 845 x[j] <- NA 846 modif <- TRUE 847 if(pr) 848 cat('\n') 849 850 cat(sum(j,na.rm=TRUE),'infinite values set to NA for variable', 851 nam[i],'\n') 852 } 853 854 isdate <- testDateTime(x) 855 if(force.single && !isdate) { 856 allna <- all(is.na(x)) 857 if(allna) { 858 storage.mode(x) <- 'integer' 859 modif <- TRUE 860 } 861 862 if(!allna) { 863 notfractional <- !any(floor(x) != x, na.rm=TRUE) 864 if(max(abs(x),na.rm=TRUE) <= (2^31-1) && notfractional) { 865 storage.mode(x) <- 'integer' 866 modif <- TRUE 867 } 868 } 869 } 870 } 871 872 if(charfactor && is.character(x)) { 873 if(length(unique(x)) < .5*length(x)) { 874 x <- sub(' +$', '', x) # remove trailing blanks 875 x <- factor(x, exclude='') 876 modif <- TRUE 877 } 878 } 879 880 if(modif) obj[[i]] <- x 881 NULL 882 } 883 884 if(pr) cat('\n') 885 if(!missing(sasdict)) { 886 sasat <- sasdict[1,] 887 attributes(obj) <- c(attributes(obj), 888 sasds=as.character(sasat$MEMNAME), 889 sasdslabel=as.character(sasat$MEMLABEL)) 890 } 891 892 obj 893} 894 895upData <- function(object, ..., 896 rename=NULL, drop=NULL, 897 labels=NULL, units=NULL, levels=NULL, 898 force.single=TRUE, lowernames=FALSE, caplabels=FALSE, 899 moveUnits=FALSE, charfactor=FALSE) { 900 901 upfirst <- function(txt) gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE) 902 903 n <- nrow(object) 904 if(!length(n)) { 905 x <- object[[1]] 906 d <- dim(x) 907 n <- if(length(d)) d[1] 908 else length(x) 909 } 910 911 rnames <- row.names(object) 912 913 if(lowernames) 914 names(object) <- casefold(names(object)) 915 no <- names(object) 916 917 cat('Input object size:\t',object.size(object),'bytes;\t', 918 length(no),'variables\n') 919 920 ## The following is targeted at R workspaces exported from StatTransfer 921 al <- attr(object, 'var.labels') 922 if(length(al)) { 923 if(caplabels) al <- upfirst(al) 924 for(i in 1:length(no)) 925 if(al[i] != '') label(object[[i]]) <- al[i] 926 attr(object, 'var.labels') <- NULL 927 if(missing(force.single)) force.single <- FALSE 928 } else if(caplabels) { 929 for(i in 1:length(no)) 930 if(length(la <- attr(object[[i]], 'label'))) 931 attr(object[[i]], 'label') <- upfirst(la) 932 } 933 934 if(moveUnits) 935 for(i in 1:length(no)) { 936 z <- object[[i]] 937 lab <- attr(z,'label') 938 if(!length(lab) || length(attr(z,'units'))) 939 next 940 941 paren <- length(grep('\\(.*\\)',lab)) 942 brack <- length(grep('\\[.*\\]',lab)) 943 if(paren+brack == 0) 944 next 945 946 cat('Label for',no[i],'changed from',lab,'to ') 947 u <- if(paren)regexpr('\\(.*\\)',lab) 948 else regexpr('\\[.*\\]',lab) 949 950 len <- attr(u,'match.length') 951 un <- substring(lab, u+1, u+len-2) 952 lab <- substring(lab, 1, u-1) 953 if(substring(lab, nchar(lab), nchar(lab)) == ' ') 954 lab <- substring(lab, 1, nchar(lab)-1) # added 2nd char above 8jun03 955 956 cat(lab,'\n\tunits set to ',un,'\n',sep='') 957 attr(z,'label') <- lab 958 attr(z,'units') <- un 959 object[[i]] <- z 960 } 961 962 if(length(rename)) { 963 nr <- names(rename) 964 if(length(nr)==0 || any(nr=='')) 965 stop('the list or vector specified in rename must specify variable names') 966 967 for(i in 1:length(rename)) { 968 if(nr[i] %nin% no) 969 stop(paste('unknown variable name:',nr[i])) 970 971 cat('Renamed variable\t', nr[i], '\tto', rename[[i]], '\n') 972 } 973 974 no[match(nr, no)] <- unlist(rename) 975 names(object) <- no 976 } 977 978 z <- substitute(list(...)) 979 980 if(length(z) > 1) { 981 z <- z[-1] 982 vn <- names(z) 983 if(!length(vn) || any(vn=='')) 984 stop('variables must all have names') 985 986 for(i in 1:length(z)) { 987 v <- vn[i] 988 if(v %in% no) 989 cat('Modified variable\t',v,'\n') 990 else { 991 cat('Added variable\t\t', v,'\n') 992 no <- c(no, v) 993 } 994 995 x <- eval(z[[i]], object) 996 d <- dim(x) 997 lx <- if(length(d))d[1] 998 else length(x) 999 1000 if(lx != n) { 1001 if(lx == 1) 1002 warning(paste('length of ',v, 1003 ' is 1; will replicate this value.',sep='')) 1004 else { 1005 f <- find(v) 1006 if(length(f))cat('Variable',v,'found in', 1007 paste(f,collapse=' '),'\n') 1008 1009 stop(paste('length of ',v,' (',lx, ')\n', 1010 'does not match number of rows in object (', 1011 n,')',sep='')) 1012 } 1013 } 1014 1015 ## If x is factor and is all NA, user probably miscoded. Add 1016 ## msg. 1017 if(is.factor(x) && all(is.na(x))) 1018 warning(paste('Variable ',v,'is a factor with all values NA.\n', 1019 'Check that the second argument to factor() matched the original levels.\n', 1020 sep='')) 1021 1022 object[[v]] <- x 1023 } 1024 } 1025 1026 if(force.single) { 1027 sm <- sapply(object, storage.mode) 1028 if(any(sm=='double')) 1029 for(i in 1:length(sm)) { # 28Mar01 1030 if(sm[i]=='double') { 1031 x <- object[[i]] 1032 if(testDateTime(x)) 1033 next ## 31aug02 1034 1035 if(all(is.na(x))) 1036 storage.mode(object[[i]]) <- 'integer' 1037 else { 1038 notfractional <- !any(floor(x) != x, na.rm=TRUE) ## 28Mar01 1039 ## max(abs()) 22apr03 1040 if(notfractional && max(abs(x),na.rm=TRUE) <= (2^31-1)) 1041 storage.mode(object[[i]]) <- 'integer' 1042 } 1043 } 1044 } 1045 } 1046 1047 if(charfactor) { 1048 g <- function(z) { 1049 if(!is.character(z)) return(FALSE) 1050 length(unique(z)) < .5*length(z) 1051 } 1052 mfact <- sapply(object, g) 1053 if(any(mfact)) 1054 for(i in (1:length(mfact))[mfact]) { 1055 x <- sub(' +$', '', object[[i]]) # remove trailing blanks 1056 object[[i]] <- factor(x, exclude='') 1057 } 1058 } 1059 1060 if(length(drop)) { 1061 if(length(drop)==1) 1062 cat('Dropped variable\t',drop,'\n') 1063 else 1064 cat('Dropped variables\t',paste(drop,collapse=','),'\n') 1065 1066 s <- drop %nin% no 1067 if(any(s)) 1068 warning(paste('The following variables in drop= are not in object:', 1069 paste(drop[s],collapse=' '))) 1070 1071 no <- no[no %nin% drop] 1072 object <- object[no] 1073 } 1074 1075 if(length(levels)) { 1076 if(!is.list(levels)) 1077 stop('levels must be a list') 1078 1079 nl <- names(levels) 1080 s <- nl %nin% no 1081 if(any(s)) { 1082 warning(paste('The following variables in levels= are not in object:', 1083 paste(nl[s],collapse=' '))) 1084 nl <- nl[!s] 1085 } 1086 1087 for(n in nl) { 1088 if(!is.factor(object[[n]])) 1089 object[[n]] <- as.factor(object[[n]]) 1090 1091 levels(object[[n]]) <- levels[[n]] 1092 ## levels[[nn]] will usually be a list; S+ invokes merge.levels 1093 } 1094 } 1095 1096 if(length(labels)) { 1097 nl <- names(labels) 1098 if(!length(nl)) stop('elements of labels were unnamed') 1099 s <- nl %nin% no 1100 if(any(s)) { 1101 warning(paste('The following variables in labels= are not in object:', 1102 paste(nl[s], collapse=' '))) 1103 nl <- nl[!s] 1104 } 1105 1106 for(n in nl) 1107 label(object[[n]]) <- labels[[n]] 1108 } 1109 1110 if(length(units)) { 1111 ##if(!is.list(units))stop('units must be a list') 1112 nu <- names(units) 1113 s <- nu %nin% no 1114 if(any(s)) { 1115 warning(paste('The following variables in units= are not in object:', 1116 paste(nu[s], collapse=' '))) 1117 nu <- nu[!s] 1118 } 1119 for(n in nu) 1120 attr(object[[n]],'units') <- units[[n]] 1121 } 1122 1123 cat('New object size:\t',object.size(object),'bytes;\t', 1124 length(no),'variables\n') 1125 ## if(.R.) object <- structure(object, class='data.frame', row.names=rnames) 1126 object 1127 } 1128 1129dataframeReduce <- function(data, fracmiss=1, maxlevels=NULL, 1130 minprev=0, pr=TRUE) 1131 { 1132 g <- function(x, fracmiss, maxlevels, minprev) 1133 { 1134 if(is.matrix(x)) 1135 { 1136 f <- mean(is.na(x %*% rep(1,ncol(x)))) 1137 return(if(f > fracmiss) 1138 paste('fraction missing>',fracmiss,sep='') else '') 1139 } 1140 h <- function(a, b) 1141 if(a=='') b else if(b=='') a else paste(a, b, sep=';') 1142 f <- mean(is.na(x)) 1143 x <- x[!is.na(x)] 1144 n <- length(x) 1145 r <- if(f > fracmiss) 1146 paste('fraction missing>',fracmiss,sep='') else '' 1147 if(is.character(x)) x <- factor(x) 1148 if(length(maxlevels) && is.category(x) && 1149 length(levels(x)) > maxlevels) 1150 return(h(r, paste('categories>',maxlevels,sep=''))) 1151 s <- '' 1152 if(is.category(x) || length(unique(x))==2) 1153 { 1154 tab <- table(x) 1155 if((min(tab) / n) < minprev) 1156 { 1157 if(is.category(x)) 1158 { 1159 x <- combine.levels(x, minlev=minprev) 1160 s <- 'grouped categories' 1161 if(length(levels(x)) < 2) 1162 s <- paste('prevalence<', minprev, sep='') 1163 } 1164 else s <- paste('prevalence<', minprev, sep='') 1165 } 1166 } 1167 h(r, s) 1168 } 1169 h <- sapply(data, g, fracmiss, maxlevels, minprev) 1170 if(all(h=='')) return(data) 1171 if(pr) 1172 { 1173 cat('\nVariables Removed or Modified\n\n') 1174 print(data.frame(Variable=names(data)[h!=''], 1175 Reason=h[h!=''], row.names=NULL, check.names=FALSE)) 1176 cat('\n') 1177 } 1178 s <- h=='grouped categories' 1179 if(any(s)) for(i in which(s)) 1180 data[[i]] <- combine.levels(data[[i]], minlev=minprev) 1181 if(any(h != '' & !s)) data <- data[h=='' | s] 1182 data 1183 } 1184 1185spss.get <- function(file, lowernames=FALSE, 1186 datevars=NULL, 1187 use.value.labels=TRUE, 1188 to.data.frame=TRUE, 1189 max.value.labels=Inf, 1190 force.single=TRUE, allow=NULL, charfactor=FALSE) 1191 { 1192 require('foreign') 1193 if(length(grep('http://', file))) { 1194 tf <- tempfile() 1195 download.file(file, tf, mode='wb', quiet=TRUE) 1196 file <- tf 1197 } 1198 1199 w <- read.spss(file, use.value.labels=use.value.labels, 1200 to.data.frame=to.data.frame, 1201 max.value.labels=max.value.labels) 1202 1203 a <- attributes(w) 1204 vl <- a$variable.labels 1205 nam <- a$names 1206 nam <- makeNames(a$names, unique=TRUE, allow=allow) 1207 if(lowernames) nam <- casefold(nam) 1208 names(w) <- nam 1209 1210 lnam <- names(vl) 1211 if(length(vl)) 1212 for(i in 1:length(vl)) { 1213 n <- lnam[i] 1214 lab <- vl[i] 1215 if(lab != '' && lab != n) label(w[[i]]) <- lab 1216 } 1217 1218 attr(w, 'variable.labels') <- NULL 1219 if(force.single || length(datevars) || charfactor) 1220 for(v in nam) { 1221 x <- w[[v]] 1222 changed <- FALSE 1223 if(v %in% datevars) { 1224 x <- importConvertDateTime(x, 'date', 'spss') 1225 changed <- TRUE 1226 } else if(all(is.na(x))) { 1227 storage.mode(x) <- 'integer' 1228 changed <- TRUE 1229 } else if(!(is.factor(x) || is.character(x))) { 1230 if(all(is.na(x))) { 1231 storage.mode(x) <- 'integer' 1232 changed <- TRUE 1233 } else if(max(abs(x),na.rm=TRUE) <= (2^31-1) && 1234 all(floor(x) == x, na.rm=TRUE)) { 1235 storage.mode(x) <- 'integer' 1236 changed <- TRUE 1237 } 1238 } else if(charfactor && is.character(x)) { 1239 if(length(unique(x)) < .5*length(x)) 1240 { 1241 x <- sub(' +$', '', x) # remove trailing blanks 1242 x <- factor(x, exclude='') 1243 changed <- TRUE 1244 } 1245 } 1246 if(changed) w[[v]] <- x 1247 } 1248 1249 w 1250 } 1251 1252 sasxport.get <- function(file, force.single=TRUE, 1253 method=c('read.xport','dataload','csv'), 1254 formats=NULL, allow=NULL, out=NULL, 1255 keep=NULL, drop=NULL, as.is=0.5, FUN=NULL) 1256 { 1257 method <- match.arg(method) 1258 if(length(out) && method!='csv') 1259 stop('out only applies to method="csv"') 1260 1261 if(method != 'csv') 1262 require('foreign') || stop('foreign package is not installed') 1263 1264 rootsoftware <- if(method=='dataload')'dataload' 1265 else 'sas' 1266 1267 sasdateform <- 1268 toupper(c("date","mmddyy","yymmdd","ddmmyy","yyq","monyy", 1269 "julian","qtr","weekdate","weekdatx","weekday","month")) 1270 sastimeform <- toupper(c("hhmm","hour","mmss","time")) 1271 sasdatetimeform <- toupper(c("datetime","tod")) 1272 1273 if(length(grep('http://', file))) { 1274 tf <- tempfile() 1275 download.file(file, tf, mode='wb', quiet=TRUE) 1276 file <- tf 1277 } 1278 1279 dsinfo <- 1280 if(method == 'csv') lookupSASContents(file) 1281 else lookup.xport(file) 1282 1283 whichds <- 1284 if(length(keep)) 1285 keep 1286 else 1287 setdiff(names(dsinfo), c(drop,'_CONTENTS_','_contents_')) 1288 1289 ds <- switch(method, 1290 read.xport= read.xport(file), 1291 dataload = read.xportDataload(file, whichds), 1292 csv = if(!length(out)) 1293 readSAScsv(file, dsinfo, whichds)) 1294 1295 if(method=='read.xport' && (length(keep) | length(drop))) 1296 ds <- ds[whichds] 1297 1298 ## PROC FORMAT CNTLOUT= dataset present? 1299 fds <- NULL 1300 if(!length(formats)) { 1301 fds <- sapply(dsinfo, function(x) 1302 all(c('FMTNAME','START','END','MIN','MAX','FUZZ') 1303 %in% x$name)) 1304 fds <- names(fds)[fds] 1305 if(length(fds) > 1) { 1306 warning('transport file contains more than one PROC FORMAT CNTLOUT= dataset; using only the first') 1307 fds <- fds[1] 1308 } 1309 } 1310 1311 finfo <- NULL 1312 if(length(formats) || length(fds)) { 1313 finfo <- 1314 if(length(formats)) 1315 formats 1316 else if(length(out)) 1317 readSAScsv(file, dsinfo, fds) 1318 else ds[[fds]] 1319 1320 ## Remove leading $ from char format names 1321 ## fmtname <- sub('^\\$','',as.character(finfo$FMTNAME)) 1322 fmtname <- as.character(finfo$FMTNAME) 1323 finfo <- split(finfo[c('START','END','LABEL')], fmtname) 1324 finfo <- lapply(finfo, 1325 function(f) 1326 { 1327 rb <- function(a) 1328 { # remove leading + trailing blanks 1329 a <- sub('[[:space:]]+$', '', as.character(a)) 1330 sub('^[[:space:]]+', '', a) 1331 } 1332 1333 st <- rb(f$START) 1334 en <- rb(f$END) 1335 lab <- rb(f$LABEL) 1336 ##j <- is.na(st) | is.na(en) 1337 ## st %in% c('','.','NA') | en %in% c('','.','NA') 1338 j <- is.na(st) | is.na(en) | st == '' | en == '' 1339 if(any(j)) { 1340 warning('NA in code in FORMAT definition; removed') 1341 st <- st[!j]; en <- en[!j]; lab <- lab[!j] 1342 } 1343 1344 if(!all(st==en)) 1345 return(NULL) 1346 1347 list(value = all.is.numeric(st, 'vector'), 1348 label = lab) 1349 }) 1350 } 1351 1352 ## Number of non-format datasets 1353 nods <- length(whichds) 1354 nds <- nods - (length(formats) == 0 && length(finfo) > 0) 1355 which.regular <- setdiff(whichds, fds) 1356 dsn <- tolower(which.regular) 1357 1358 if((nds > 1) && !length(out)) { 1359 res <- vector('list', nds) 1360 names(res) <- gsub('_','.',dsn) 1361 } 1362 1363 if(length(FUN)) { 1364 funout <- vector('list', length(dsn)) 1365 names(funout) <- gsub('_','.',dsn) 1366 } 1367 possiblyConvertChar <- if(method=='read.xport') 1368 (is.logical(as.is) && as.is) || 1369 (is.numeric(as.is) && as.is < 1) else 1370 (is.logical(as.is) && !as.is) || 1371 (is.numeric(as.is) && as.is > 0) 1372 ## reverse logic because read.xport always converts characters to factors 1373 j <- 0 1374 for(k in which.regular) { 1375 j <- j + 1 1376 cat('Processing SAS dataset', k, '\t ') 1377 w <- 1378 if(length(out)) 1379 readSAScsv(file, dsinfo, k) 1380 else if(nods==1) 1381 ds 1382 else ds[[k]] 1383 1384 cat('.') 1385 if(!length(w)) { 1386 cat('Empty dataset', k, 'ignored\n') 1387 next 1388 } 1389 1390 nam <- tolower(makeNames(names(w), allow=allow)) 1391 names(w) <- nam 1392 dinfo <- dsinfo[[k]] 1393 fmt <- sub('^\\$','',dinfo$format) 1394 lab <- dinfo$label 1395 ndinfo <- tolower(makeNames(dinfo$name, allow=allow)) 1396 names(lab) <- names(fmt) <- ndinfo 1397 for(i in 1:length(w)) { 1398 changed <- FALSE 1399 x <- w[[i]] 1400 fi <- fmt[nam[i]]; names(fi) <- NULL 1401 if(fi != '' && length(finfo) && (fi %in% names(finfo))) { 1402 f <- finfo[[fi]] 1403 if(length(f)) { ## may be NULL because had a range in format 1404 x <- factor(x, f$value, f$label) 1405 attr(x, 'format') <- fi 1406 changed <- TRUE 1407 } 1408 } 1409 if(is.numeric(x)) { 1410 if(fi %in% sasdateform) { 1411 x <- importConvertDateTime(x, 'date', rootsoftware) 1412 changed <- TRUE 1413 } else if(fi %in% sastimeform) { 1414 x <- importConvertDateTime(x, 'time', rootsoftware) 1415 changed <- TRUE 1416 } else if(fi %in% sasdatetimeform) { 1417 x <- importConvertDateTime(x, 'datetime', rootsoftware) 1418 changed <- TRUE 1419 } else if(force.single) { 1420 if(all(is.na(x))) { 1421 storage.mode(x) <- 'integer' 1422 changed <- TRUE 1423 } else if(max(abs(x),na.rm=TRUE) <= (2^31-1) && 1424 all(floor(x) == x, na.rm=TRUE)) { 1425 storage.mode(x) <- 'integer' 1426 changed <- TRUE 1427 } 1428 } 1429 } else if(method=='read.xport' && possiblyConvertChar && is.factor(x)) { 1430 if((is.logical(as.is) && as.is) || 1431 (is.numeric(as.is) && length(unique(x)) >= as.is*length(x))) { 1432 x <- as.character(x) 1433 changed <- TRUE 1434 } 1435 } else if(possiblyConvertChar && is.character(x)) { 1436 if((is.logical(as.is) && !as.is) || 1437 (is.numeric(as.is) && length(unique(x)) < as.is*length(x))) { 1438 x <- factor(x, exclude='') 1439 changed <- TRUE 1440 } 1441 } 1442 1443 lz <- lab[nam[i]] 1444 if(lz != '') { 1445 names(lz) <- NULL 1446 label(x) <- lz 1447 changed <- TRUE 1448 } 1449 1450 if(changed) 1451 w[[i]] <- x 1452 } 1453 1454 cat('.\n') 1455 if(length(out)) { 1456 nam <- gsub('_','.',dsn[j]) 1457 assign(nam, w) 1458 ## ugly, but a way to get actual data frame name into first 1459 ## argument of save( ) 1460 eval(parse(text=paste('save(',nam,', file="', 1461 paste(out, '/', nam,'.rda',sep=''), 1462 '", compress=TRUE)',sep=''))) 1463 if(length(FUN) && length(w)) 1464 funout[[nam]] <- FUN(w) 1465 1466 remove(nam) 1467 } else if(nds > 1) 1468 res[[j]] <- w 1469 } 1470 1471 if(length(out)) { 1472 names(dsinfo) <- gsub('_','.',tolower(names(dsinfo))) 1473 if(length(FUN)) 1474 attr(dsinfo, 'FUN') <- funout 1475 1476 invisible(dsinfo) 1477 } else if(nds > 1) 1478 res 1479 else w 1480 } 1481 1482 ## Use dataload program to create a structure like read.xport does 1483 read.xportDataload <- function(file, dsnames) { 1484 outf <- substring(tempfile(tmpdir=''),2) 1485 file.copy(file, paste(tempdir(),outf,sep='/')) 1486 curwd <- getwd() 1487 on.exit(setwd(curwd)) 1488 setwd(tempdir()) 1489 n <- length(dsnames) 1490 w <- vector('list', n); names(w) <- dsnames 1491 for(a in dsnames) { 1492 status <- system(paste('dataload', outf, 'zzzz.rda', a), 1493 intern=FALSE) 1494 if(status==0) { 1495 load('zzzz.rda') 1496 names(zzzz) <- makeNames(names(zzzz)) 1497 w[[a]] <- zzzz 1498 } 1499 } 1500 1501 w 1502 } 1503 1504 ## Read _contents_.csv and store it like lookup.xport output 1505 lookupSASContents <- function(sasdir) 1506 { 1507 w <- read.csv(paste(sasdir,'_contents_.csv',sep='/'), as.is=TRUE) 1508 z <- tapply(w$NOBS, w$MEMNAME, function(x)x[1]) 1509 if(any(z == 0)) { 1510 cat('\nDatasets with 0 observations ignored:\n') 1511 print(names(z)[z == 0], quote=FALSE) 1512 w <- subset(w, NOBS > 0) 1513 } 1514 1515 w$TYPE <- ifelse(w$TYPE==1, 'numeric', 'character') 1516 names(w) <- tolower(names(w)) 1517 unclass(split(subset(w,select=-c(memname,memlabel)), w$memname)) 1518 } 1519 1520 ## Read all SAS csv export files and store in a list 1521 readSAScsv <- function(sasdir, dsinfo, dsnames=names(dsinfo)) { 1522 sasnobs <- sapply(dsinfo, function(x)x$nobs[1]) 1523 multi <- length(dsnames) > 1 1524 if(multi) { 1525 w <- vector('list', length(dsnames)) 1526 names(w) <- dsnames 1527 } 1528 1529 for(a in dsnames) { 1530 z <- read.csv(paste(sasdir,'/',a,'.csv', sep=''), 1531 as.is=TRUE, blank.lines.skip=FALSE, 1532 comment.char="") 1533 1534 importedLength <- length(z[[1]]) 1535 if(importedLength != sasnobs[a]) 1536 cat('\nError: NOBS reported by SAS (',sasnobs[a],') for dataset ', 1537 a,' is not the same as imported length (', importedLength, 1538 ')\n', sep='') 1539 1540 if(multi) 1541 w[[a]] <- z 1542 } 1543 1544 if(multi) 1545 w 1546 else z 1547 } 1548 1549 1550 1551csv.get <- function(file, lowernames=FALSE, datevars=NULL, datetimevars=NULL, 1552 dateformat='%F', fixdates=c('none','year'), 1553 comment.char = "", autodates=TRUE, allow=NULL, 1554 charfactor=FALSE, 1555 sep=',', skip=0, vnames=NULL, labels=NULL, ...){ 1556 fixdates <- match.arg(fixdates) 1557 if(length(vnames)) 1558 vnames <- scan(file, what=character(0), skip=vnames-1, nlines=1, 1559 sep=sep, quiet=TRUE) 1560 if(length(labels)) 1561 labels <- scan(file, what=character(0), skip=labels-1, nlines=1, 1562 sep=sep, quiet=TRUE) 1563 1564 w <- if(length(vnames)) 1565 read.csv(file, check.names=FALSE, comment.char=comment.char, 1566 header=FALSE, col.names=vnames, skip=skip, sep=sep, ...) 1567 else read.csv(file, check.names=FALSE, comment.char=comment.char, 1568 sep=sep, skip=skip, ...) 1569 n <- nam <- names(w) 1570 m <- makeNames(n, unique=TRUE, allow=allow) 1571 if(length(labels)) n <- labels 1572 if(lowernames) 1573 m <- casefold(m) 1574 1575 changed <- any(m != nam) 1576 if(changed) 1577 names(w) <- m 1578 1579 if(autodates) { 1580 tmp <- w 1581 names(tmp) <- NULL 1582 1583 for(i in 1:length(tmp)) { 1584 if(! is.character(tmp[[1]])) 1585 next 1586 } 1587 } 1588 cleanup.import(w, 1589 labels=if(length(labels))labels else if(changed)n else NULL, 1590 datevars=datevars, datetimevars=datetimevars, 1591 dateformat=dateformat, 1592 fixdates=fixdates, charfactor=charfactor) 1593} 1594 1595 1596sasdsLabels <- function(file) 1597{ 1598 w <- scan(file, sep='\n', what='', quiet=TRUE) 1599 i <- grep('Data Set Name:', w) 1600 if(!length(i)) 1601 return(NULL) 1602 1603 n <- tolower(sub('.*\\.([A-Z0-9\\_]*)[[:space:]]+.*','\\1',w[i])) 1604 w <- gsub('\t','',w) 1605 labs <- ifelse(nchar(w[i-1])==0,w[i-2],w[i-1]) 1606 names(labs) <- n 1607 labs 1608} 1609