1 use std::io;
2
3 use csv;
4 use rand::Rng;
5
6 use CliResult;
7 use config::{Config, Delimiter};
8 use index::Indexed;
9 use util;
10
// Help text for `xsv sample`. `run` hands this string to `util::get_args`,
// which uses it to populate `Args`.
//
// NOTE(review): this looks like docopt-style usage text, where each option and
// its description must be separated by at least two spaces and the option
// table is column-aligned -- keep that layout when editing, and confirm
// against `util::get_args`.
static USAGE: &'static str = "
Randomly samples CSV data uniformly using memory proportional to the size of
the sample.

When an index is present, this command will use random indexing if the sample
size is at most 10% of the total number of records. This allows for efficient
sampling such that the entire CSV file is not parsed.

This command is intended to provide a means to sample from a CSV data set that
is too big to fit into memory (for example, for use with commands like 'xsv
frequency' or 'xsv stats'). It will however visit every CSV record exactly
once, which is necessary to provide a uniform random sample. If you wish to
limit the number of records visited, use the 'xsv slice' command to pipe into
'xsv sample'.

Usage:
    xsv sample [options] <sample-size> [<input>]
    xsv sample --help

Common options:
    -h, --help             Display this message
    -o, --output <file>    Write output to <file> instead of stdout.
    -n, --no-headers       When set, the first row will be considered as part of
                           the population to sample from. (When not set, the
                           first row is the header row and will always appear
                           in the output.)
    -d, --delimiter <arg>  The field delimiter for reading CSV data.
                           Must be a single character. (default: ,)
";
40
/// Parsed command-line arguments for `xsv sample`.
///
/// `run` obtains this struct from `util::get_args(USAGE, argv)`. The field
/// names appear to follow the `arg_*` (positional) / `flag_*` (option) naming
/// convention keyed to the entries in `USAGE` -- confirm against
/// `util::get_args` before renaming anything.
#[derive(Deserialize)]
struct Args {
    /// Input location handed to `Config::new` for reading.
    /// Presumably `None` means standard input -- verify in `Config`.
    arg_input: Option<String>,
    /// Number of records requested for the sample.
    arg_sample_size: u64,
    /// Output location handed to `Config::new` for writing.
    /// Per `USAGE`, output goes to stdout when this is absent.
    flag_output: Option<String>,
    /// Forwarded to `Config::no_headers`; per `USAGE`, when set the first row
    /// is treated as data rather than as a header row.
    flag_no_headers: bool,
    /// Forwarded to `Config::delimiter`; the field delimiter used when reading.
    flag_delimiter: Option<Delimiter>,
}
49
run(argv: &[&str]) -> CliResult<()>50 pub fn run(argv: &[&str]) -> CliResult<()> {
51 let args: Args = util::get_args(USAGE, argv)?;
52 let rconfig = Config::new(&args.arg_input)
53 .delimiter(args.flag_delimiter)
54 .no_headers(args.flag_no_headers);
55 let sample_size = args.arg_sample_size;
56
57 let mut wtr = Config::new(&args.flag_output).writer()?;
58 let sampled = match rconfig.indexed()? {
59 Some(mut idx) => {
60 if do_random_access(sample_size, idx.count()) {
61 rconfig.write_headers(&mut *idx, &mut wtr)?;
62 sample_random_access(&mut idx, sample_size)?
63 } else {
64 let mut rdr = rconfig.reader()?;
65 rconfig.write_headers(&mut rdr, &mut wtr)?;
66 sample_reservoir(&mut rdr, sample_size)?
67 }
68 }
69 _ => {
70 let mut rdr = rconfig.reader()?;
71 rconfig.write_headers(&mut rdr, &mut wtr)?;
72 sample_reservoir(&mut rdr, sample_size)?
73 }
74 };
75 for row in sampled.into_iter() {
76 wtr.write_byte_record(&row)?;
77 }
78 Ok(wtr.flush()?)
79 }
80
sample_random_access<R, I>( idx: &mut Indexed<R, I>, sample_size: u64, ) -> CliResult<Vec<csv::ByteRecord>> where R: io::Read + io::Seek, I: io::Read + io::Seek81 fn sample_random_access<R, I>(
82 idx: &mut Indexed<R, I>,
83 sample_size: u64,
84 ) -> CliResult<Vec<csv::ByteRecord>>
85 where R: io::Read + io::Seek, I: io::Read + io::Seek
86 {
87 let mut all_indices = (0..idx.count()).collect::<Vec<_>>();
88 let mut rng = ::rand::thread_rng();
89 rng.shuffle(&mut *all_indices);
90
91 let mut sampled = Vec::with_capacity(sample_size as usize);
92 for i in all_indices.into_iter().take(sample_size as usize) {
93 idx.seek(i)?;
94 sampled.push(idx.byte_records().next().unwrap()?);
95 }
96 Ok(sampled)
97 }
98
sample_reservoir<R: io::Read>( rdr: &mut csv::Reader<R>, sample_size: u64, ) -> CliResult<Vec<csv::ByteRecord>>99 fn sample_reservoir<R: io::Read>(
100 rdr: &mut csv::Reader<R>,
101 sample_size: u64,
102 ) -> CliResult<Vec<csv::ByteRecord>> {
103 // The following algorithm has been adapted from:
104 // http://en.wikipedia.org/wiki/Reservoir_sampling
105 let mut reservoir = Vec::with_capacity(sample_size as usize);
106 let mut records = rdr.byte_records().enumerate();
107 for (_, row) in records.by_ref().take(reservoir.capacity()) {
108 reservoir.push(row?);
109 }
110
111 // Now do the sampling.
112 let mut rng = ::rand::thread_rng();
113 for (i, row) in records {
114 let random = rng.gen_range(0, i+1);
115 if random < sample_size as usize {
116 reservoir[random] = row?;
117 }
118 }
119 Ok(reservoir)
120 }
121
/// Decides whether sampling should seek through the index instead of
/// streaming the whole file.
///
/// Returns `true` when `sample_size` is no larger than one tenth of `total`
/// (using integer division, so the tenth is rounded down).
fn do_random_access(sample_size: u64, total: u64) -> bool {
    let threshold = total / 10;
    threshold >= sample_size
}
125