1package Crypt::PassGen;
2
3=head1 NAME
4
5Crypt::PassGen - Generate a random password that looks like a real word
6
7=head1 SYNOPSIS
8
9  use Crypt::PassGen qw/ passgen /;
10
11  @passwords = passgen( NWORDS => 10, NLETT => 8 );
12
13=head1 DESCRIPTION
14
15This module provides a single command for generating random password
16that is close enough to a real word that it is easy to remember.
17It does this by using the frequency of letter combinations in
18a language (the frequency table is generated during installation
19although multiple tables can be generated and used for different
20languages). The frequency table contains the probability that
21a word will start with a specific letter or 2 letter combination
22and then the frequency of 3 letter combinations.
23
24This module should not be used for high security applications
25(such as user accounts) since it returns passwords that are not
26mixed case, have no punctuation and no letters. This word can be
27used as a basis for a more secure password.
28
29The language of the password depends on the language used to construct
30the frequency table.
31
32=cut
33
34use integer;
35use strict;
36use Storable qw/ nstore retrieve /;
37use File::Spec;
38use Config;
39use vars qw/ $VERSION @ISA @EXPORT_OK $DEFAULT_DICT $DEFAULT_FREQFILE
40  $ERRSTR
41  /;
42
43$VERSION = '0.06';
44
45use constant MAXN => 32000;
46require Exporter;
47@ISA = qw( Exporter );
48@EXPORT_OK = qw( passgen ingest );
49
50# Default input dictionary and frequency file
51# The frequency file should be stored in the same place as this
52# module [use Config]
53
54$DEFAULT_DICT = '/usr/dict/words';  # Unix specific
55
56$DEFAULT_FREQFILE = File::Spec->catfile($Config{installsitelib},
57					"Crypt",
58					"PassGenWordFreq.dat");
59
60# This is the cache of frequency data to prevent us going to
61# disk each time passgen() is called. This effectively means
62# that calling passgen() 100 times is almost as fast as calling
63# passgen once for 100 passwords.
64
65my %CACHE;
66
67# Set up a hash with a lookup table to translate a character to
68# a position in an array
69
70my %letters = (
71    A => 0,
72    B => 1,
73    C => 2,
74    D => 3,
75    E => 4,
76    F => 5,
77    G => 6,
78    H => 7,
79    I => 8,
80    J => 9,
81    K => 10,
82    L => 11,
83    M => 12,
84    N => 13,
85    O => 14,
86    P => 15,
87    Q => 16,
88    R => 17,
89    S => 18,
90    T => 19,
91    U => 20,
92    V => 21,
93    W => 22,
94    X => 23,
95    Y => 24,
96    Z => 25
97);
98
99# ...and generate the inverse lookup table so that we can go from
100# a position to a letter
101
102my %revlett;
103foreach (keys %letters) {
104  $revlett{ $letters{$_} } = $_;
105}
106
107=head1 FUNCTIONS
108
109The following functions are provided:
110
111=over 4
112
113=item B<ingest>
114
115This function is used to create a frequency table to be used later
116by C<passgen>. This routine is run during the initial install of the
117module so that at least one frequency table is available.
118
119This function reads a file and for each word that is found (ignoring
120any with non-alphabet characters) notes the starting letter, the
121second letter and each combination of 3 letters. Once the file is read
122the resultant arrays then contain the relative occurence of each letter
123combination. The frequency table will vary depending on the language
124of the input file.
125
126  ingest( DICT   => '/usr/dict/words',
127          FILE   => 'wordfreq.dat',
128          APPEND => 0)
129
130The input hash can contain keys C<DICT>, C<FILE> and C<APPEND>
131with the above defaults. All arguments are optional. If C<APPEND>
132is true the frequency table from the input dictionary will be
133appended to an existing table (if it exists).
134
135Returns 1 if successful and 0 otherwise. On error, the reason
136is stored in $Crypt::PassGen::ERRSTR.
137
138A default frequency file is provided for C<passgen> as part of the
139installation.  This routine is only required to either extend or
140replace the default value.
141
142=cut
143
144sub ingest {
145
146  my %defaults = (
147		  DICT => $DEFAULT_DICT,
148		  FILE => $DEFAULT_FREQFILE,
149		  APPEND => 0,
150		  );
151
152  my %opts = ( %defaults, @_ );
153
154  # This becomes our pseudo-object
155  my $data;
156
157  # If we are appending to previous data we simply need
158  # to read that in to initialise the arrays
159  if ($opts{APPEND} && -e $opts{FILE}) {
160
161    $data = _readdata( $opts{FILE} );
162
163  } else {
164    # Initialise these arrays with zeroes to be -w clean
165    # Calculate the size
166    my $nkeys = (scalar keys %letters) - 1;
167
168    # Create the data structure with 3 arrays
169    $data = {
170	     FIRST => [], # Occurence of a starting letter
171	     SECOND => [],# Occurences of the first 2 letter combos
172	     THIRD => [], # Occurences of 3 letter combinations
173	    };
174
175    # Not appending, so we need to presize and fill with zeroes
176    # presize then initialise with map
177    $data->{FIRST}[$nkeys] = 0;
178    @{ $data->{FIRST} } = map { 0 } @{ $data->{FIRST} };
179    for my $i (0..$nkeys) {
180      $data->{SECOND}[$i][$nkeys] = 0;
181      @{ $data->{SECOND}[$i] } = map { 0 } @{ $data->{SECOND}[$i] };
182
183      for my $j ( 0..$nkeys ) {
184	$data->{THIRD}[$i][$j][$nkeys] = 0;
185	@{ $data->{THIRD}[$i][$j] } = map { 0 } @{ $data->{THIRD}[$i][$j] }
186      }
187    }
188  }
189
190  # Open the dictionary file
191  open( LISTOWORDS, $opts{DICT}) or
192    do { $ERRSTR = "Could not open dictionary file $opts{DICT}: $!"; return 0};
193
194  # Now read a line at a time from the file
195  while (<LISTOWORDS>){
196    my @words = split ;
197    for my $word (@words){
198      next if $word !~ /^[a-z]+$/i || length($word) < 3;
199      $word = uc($word);
200      # Split the word into letters
201      my @temlets = split //,$word;
202
203      # increment the freq. of the first two letters of the word
204      $data->{FIRST}[ $letters{$temlets[0]} ]++;
205
206      # Divide everything by 2 if we are becoming too large
207      _scale_first_down( $data->{FIRST} )
208	if $data->{FIRST}[$letters{$temlets[0]} ] > MAXN;
209
210      $data->{SECOND}[ $letters{$temlets[0]} ][ $letters{$temlets[1]} ]++;
211
212      # Divide everything by 2 if we are becoming too large
213      _scale_seconds_down( $data->{SECOND} )
214	if $data->{SECOND}[$letters{$temlets[0]}][$letters{$temlets[1]}] > MAXN;
215
216      # look at letter freq for rest of the word
217      for my $j (2 .. $#temlets){
218	$data->{THIRD}[$letters{$temlets[$j-2]}][$letters{$temlets[$j-1]}][$letters{$temlets[$j]}]++;
219
220	# Divide everything by 2 if we are becoming too large
221	_scale_thirds_down( $data->{THIRD} )
222	  if $data->{THIRD}[$letters{$temlets[$j-2]}][$letters{$temlets[$j-1]}][$letters{$temlets[$j]}] > MAXN;
223      }
224    }
225  }
226
227  # Close dictionary file
228  close( LISTOWORDS ) or
229    do {
230      $ERRSTR = "Could not close dictionary file $opts{DICT}: $!";
231      return 0;
232    };
233
234  # Precalculate the totals - this is a trade off of
235  # disk space versus speed and disk space is cheap (and this
236  # is not a very large array anyway
237  _calctotals( $data );
238
239  # Now store the data in the output file
240  _storedata( $data, , $opts{FILE} ) or
241    do {
242      $ERRSTR = "Error storing data to $opts{FILE}";
243      return;
244    };
245
246  return 1;
247}
248
249=item B<passgen>
250
251Generate a password.
252
253  @words = passgen( %options );
254
255Argument is a hash with the following keys:
256
257 FILE   The filename containing the frequency information. Must
258        have been written using C<ingest>.
259 NLETT  Number of letters to use for the generated password.
260        Must be at least 5
261 NWORDS Number of passwords to generate
262
263An array of passwords is returned. An empty list is returned
264if an error occurs (and $Crypt::PassGen::ERRSTR is set to
265the reason).
266
267=cut
268
269sub passgen {
270
271  my %defaults = (
272		  FILE => $DEFAULT_FREQFILE,
273		  NLETT => 8,
274		  NWORDS => 1,
275		 );
276
277  my %opts = (%defaults, @_);
278
279  # Return if NLETT is too short
280  if ($opts{NLETT} < 5) {
281    $ERRSTR = 'A password must be at least 5 letters';
282    return ();
283  }
284
285  # Read in the data
286  my $data = _readdata( $opts{FILE} );
287
288  # Calculate the minimum score
289  my $minscore = _calcminscore( $data, $opts{NLETT} );
290
291  # Generate the required number of passwords
292  my @WORDS;
293  for my $n ( 1..$opts{NWORDS} ) {
294
295    push(@WORDS, _generate( $data, $opts{NLETT}, $minscore )) ;
296
297  }
298
299  return @WORDS;
300}
301
302
303# internal routines
304
305# Generate a password
306# Arguments: data 'object', number of letters, minimum score
307# returns a lower-cased password
308
309sub _generate ($$) {
310  my ($data, $nlett, $minscore) = @_;
311
312  # Need to loop round until we reach minimum score
313  my $score = 0;
314  my $n = 0;
315  my $word;
316
317  WORDLOOP: while ($score < $minscore || length($word) < $nlett) {
318
319    # reset current score
320    $score = 0;
321
322    # Keep track of the number of times around
323    $n++;
324    if ($n > 100) {
325      $n = 0;
326      $minscore *= 0.75;
327    }
328
329    # These calculations could all be prettified off into a sub
330    # Now pick letters at random (starting with the first)
331    my $ind = _tot_to_index(int(rand( $data->{FIRST_TOT} )), $data->{FIRST} );
332    next WORDLOOP if $ind < 0;
333    $word = $revlett{ $ind };
334    $score= $data->{FIRST}[ $ind ];
335    my $prev1 = $ind;
336
337    # Now the second letter
338    $ind = _tot_to_index( int(rand( $data->{SECOND_TOT}[$prev1] )),
339			  $data->{SECOND}[ $prev1 ]);
340    next WORDLOOP if $ind < 0;
341    my $prev2 = $ind;
342    $score += $data->{SECOND}[ $prev1 ][ $prev2 ];
343    $word .= $revlett{ $ind };
344
345    # Loop until we get the required number of letters
346    for my $i ( 3.. $nlett ) {
347      $ind = _tot_to_index( int(rand( $data->{PAIR_TOT}[$prev1][$prev2] )),
348			    $data->{THIRD}[$prev1][$prev2] );
349      next WORDLOOP if $ind < 0;
350      $score += $data->{THIRD}[$prev1][$prev2][$ind];
351      $word .= $revlett{ $ind };
352      $prev1 = $prev2; # store the previous two letters
353      $prev2 = $ind;
354    }
355
356  }
357
358  return lc($word);
359
360}
361
362# store the frequency data to disk
363# Arguments:
364#    Data to store ( the 'object' )
365#    Output filename
366#    Append or not
367
368# Returns: 1 (good), 0 (bad)
369
370sub _storedata ($$) {
371  my ($data, $file ) = @_;
372
373  # Now simply write the data in network order
374  nstore( $data, $file );
375}
376
377# Read the data
378# Arguments:  filename
379# Returns :  the data (undef on error)
380# The data is a hash with keys FIRST, SECOND, THIRD
381# The data is cached to prevent reading the frequency
382# table from disk each time -- 99.9% of the time we will
383# be reading from the same file and the memory overhead
384# of keeping the cache open is insignificant
385# The cache is keyed by the filename but can not tell that
386# file /a/b/c is the same as 'b/c'.
387
388# This is a 'constructor'
389
390sub _readdata {
391  my $file = shift;
392  if ( exists $CACHE{ $file } ) {
393    return $CACHE{ $file };
394  } else {
395    my $data = retrieve( $file );
396    $CACHE{ $file } = $data;
397    return $data;
398  }
399}
400
401# Divide everything in a 1-D array by 2
402
403sub _scale_first_down {
404  use integer;
405  my $arr = shift;
406  for ( @$arr ) {
407    $_ /= 2;
408  }
409}
410
411# Divide everything in a 2-D array by 2
412
413sub _scale_seconds_down {
414  use integer;
415  my $arr = shift;
416  for my $i (@$arr) {
417    for my $j (@$i) {
418      $j /= 2;
419    }
420  }
421}
422
423# Divide everything in a 3-D array by 2.0
424
425sub _scale_thirds_down {
426  use integer;
427  my $arr = shift;
428  for my $i (@$arr) {
429    for my $j (@$i) {
430      for my $k (@$j) {
431	$k /= 2;
432      }
433    }
434  }
435}
436
437# Calculate the totals
438# Effectively calculates the total weight for each letter combination
439# Argument: hash reference containing FIRST, SECOND and THIRD
440#
441# Adds the following keys FIRST_TOT (scalar), SECOND_TOT, PAIR_TOT
442# (array refs) which are the totals for each letter combination.
443# and AVFIRST, AVSECOND and AVHTHIRD (the average occurence related
444
445sub _calctotals {
446  my $data = shift;
447
448  my ( $nfirst, $nsec, $nthird );
449  my ( $second_fullsum, $third_fullsum );
450
451  # Get the size (yes I know it is 26-1)
452  my $size = $#{ $data->{FIRST} };
453
454  # Init
455  $data->{FIRST_TOT} = 0;
456  $data->{SECOND_TOT} = [];
457  @{ $data->{SECOND_TOT} } = map { 0 } (0..$size);
458  for my $i ( 0.. $size ) {
459    $data->{PAIR_TOT}[$i] = [];
460    @{ $data->{PAIR_TOT}[$i] } = map { 0 } (0..$size);
461  }
462
463  # Loop over all members summing up
464  for my $i ( 0 .. $size ) {
465    $data->{FIRST_TOT} += $data->{FIRST}[$i];
466    $nfirst++ if $data->{FIRST}[$i];
467    for my $j ( 0 .. $size ) {
468      $data->{SECOND_TOT}[$i] += $data->{SECOND}[$i][$j];
469      $nsec++ if $data->{SECOND}[$i][$j];
470      $second_fullsum += $data->{SECOND}[$i][$j];
471      for my $k ( 0 .. $size ) {
472	$data->{PAIR_TOT}[$i][$j] += $data->{THIRD}[$i][$j][$k];
473	$nthird++ if $data->{THIRD}[$i][$j][$k];
474	$third_fullsum += $data->{THIRD}[$i][$j][$k];
475      }
476    }
477  }
478
479  # Calculate the average none zero occurence
480  $data->{AVFIRST} = $data->{FIRST_TOT} / $nfirst;
481  $data->{AVSECOND} = $second_fullsum / $nsec;
482  $data->{AVTHIRD} = $third_fullsum / $nthird;
483
484}
485
486# Calculate the minimum score. When each letter is selected
487# its occurence value is added to a score. The minimum score
488# criterion decides whether the generated password is good enough
489# or needs to be regenerated. It is simply the sum of the
490# average occurences for each letter multiplied by 3.
491
492# Arguments: Data hash, length of required password
493
494sub _calcminscore {
495  my $data = shift;
496  my $length = shift;
497
498  my $score = 3 * ( $data->{AVFIRST} + $data->{AVSECOND} +
499    ( ( $length - 2 ) * $data->{AVTHIRD} ) );
500
501  return $score;
502}
503
504# Translate a position in the TOT array to an index in the corresponding
505# array
506
507# Arguments: Total, array ref to be searched
508# Returns: pos
509#          -1 if no index could be determined
510
511sub _tot_to_index {
512  my ($tot, $arr) = @_;
513  my $i=0;
514  while ($tot >= 0 && $i <= $#$arr) {
515    $tot -= $arr->[ $i ];
516#    print "Tot now: $tot\t $i ",$revlett{$i}," ",$arr->[$i],"\n";
517    $i++;
518  }
519
520  # if we are still >= 0 we could not match an index
521  return -1 if $tot >= 0;
522
523  # Found a valid index
524  return --$i;
525}
526
527
528=back
529
530=head1 ERROR HANDLING
531
532All routines in this module store errors in the ERRSTR
533variable. This variable can be accessed if the routines
534return an error state and contains the reason for the error.
535
536  @words = passgen( NLETT => 2 )
537    or die "Error message: $Crypt::PassGen::ERRSTR";
538
539=head1 AUTHORS
540
541Tim Jenness E<lt>tjenness@cpan.orgE<gt> Copyright (C)
5422000-2012 T. Jenness. All Rights Reserved.  This program is free
543software; you can redistribute it and/or modify it under the same
544terms as Perl itself.
545
546Based on the PASSGEN program written by Mike Bartman of SAR, Inc as
547part of the SPAN security toolkit.
548
549=cut
550
551
5521;
553