#! @PERL@

# Usage: fasta2gsi <seqfile>
# Creates seqfile.gsi
#
# Create a .gsi sequence database index file.
#
# GSI allows multiple files per index, but fasta2gsi.pl
# creates a GSI index for a single FASTA file.
#
# Part of the SQUID sequence analysis library.
# Copyright (C) 1992-1996 Sean R. Eddy


# Script-wide settings. These are deliberately package globals: the
# driver code further down in this file reads them by name.
$seqfile = shift;
$gsifile = $seqfile.".gsi";
$tmpfile = $seqfile.".tmpgsi";

# Library of Perl functions for creating GSI index files.
#
# GSI definition:
#    1 + <nfiles> + <nkeys> total records.
#    Each record = 38 bytes.
#
#  one header record     :  <"GSI"    (32)> <nfiles (2)> <nkeys (4)>
#  <nfiles> file records :  <filename (32)> <fileno (2)> <fmt   (4)>
#  <nkeys>  key records  :  <key      (32)> <fileno (2)> <offset(4)>
#
# Every record is produced with pack("a32 n N", ...): a 32-byte text
# field (NUL-padded, or truncated if the text is longer than 32 bytes),
# an unsigned 16-bit big-endian integer and an unsigned 32-bit
# big-endian integer.
#
# Part of the SQUID sequence analysis library.
# Copyright (C) 1992-1996 Sean R. Eddy


# The following numbers MUST match their counterparts in squid.h
#
$sqd_fmt_genbank = 2;
$sqd_fmt_embl    = 4;
$sqd_fmt_fasta   = 7;
$sqd_fmt_pir     = 12;

# Strictures are scoped to the GSI writer library only, so the package
# globals used elsewhere in this script are unaffected.
{
    use strict;
    use warnings;
    use Symbol qw(qualify_to_ref);

    # Function: _gsi_check_fits($what, $value, $max)
    #
    # Die unless $value lies in 0..$max. pack() would otherwise wrap an
    # out-of-range value silently and the index would point at the
    # wrong place. Undefined/non-numeric values are coerced to 0,
    # exactly as pack() itself coerces them.
    #
    sub _gsi_check_fits {
        my ($what, $value, $max) = @_;
        no warnings qw(numeric uninitialized);
        if ($value < 0 || $value > $max) {
            die "$what ($value) does not fit in a GSI record (allowed range 0..$max)\n";
        }
        return 1;
    }

    # Function: _gsi_write_record($who, $fh, $text,
    #                             $short_name, $short, $long_name, $long)
    #
    # Pack one 38-byte GSI record and print it to the glob reference $fh.
    # $who and the *_name arguments are only used in error messages.
    # Dies if a number does not fit its field or if print fails.
    # Returns 1 on success.
    #
    sub _gsi_write_record {
        my ($who, $fh, $text, $short_name, $short, $long_name, $long) = @_;
        _gsi_check_fits("$who: $short_name", $short, 0xFFFF);
        _gsi_check_fits("$who: $long_name",  $long,  0xFFFFFFFF);
        print {$fh} pack("a32 n N", $text, $short, $long)
            or die "$who: write failed: $!\n";
        return 1;
    }

    # Function: GSI_WriteHeader(GSIFILE, $filenum, $keynum)
    #
    # Write the header of an open GSI file.
    #
    # GSIFILE may be a handle name (e.g. the bareword-as-string
    # "GSIFILE"), a typeglob, or a glob/lexical handle reference; names
    # are resolved in the caller's package.
    # $filenum is the number of file records (0..65535) and $keynum the
    # number of key records (0..4294967295) that will follow.
    # Returns 1; dies on an out-of-range count or a failed write.
    #
    sub GSI_WriteHeader {
        my ($handle, $filenum, $keynum) = @_;
        my $fh = qualify_to_ref($handle, scalar(caller));
        return _gsi_write_record("GSI_WriteHeader", $fh, "GSI",
                                 "file count", $filenum,
                                 "key count",  $keynum);
    }

    # Function: GSI_WriteFileRecord(GSIFILE, $filename, $idx, $fmt)
    #
    # Write a file record to an open GSI file.
    #
    # GSIFILE is interpreted as for GSI_WriteHeader. $filename is stored
    # in the 32-byte text field (longer names are truncated), $idx in
    # the 2-byte field and $fmt (one of the $sqd_fmt_* codes) in the
    # 4-byte field.
    # Returns 1; dies on an out-of-range number or a failed write.
    #
    sub GSI_WriteFileRecord {
        my ($handle, $filename, $idx, $fmt) = @_;
        my $fh = qualify_to_ref($handle, scalar(caller));
        return _gsi_write_record("GSI_WriteFileRecord", $fh, $filename,
                                 "file index",  $idx,
                                 "format code", $fmt);
    }

    # Function: GSI_WriteKeyRecord(GSIFILE, $key, $filenum, $offset)
    #
    # Write a key record to an open GSI file.
    #
    # GSIFILE is interpreted as for GSI_WriteHeader. $key is stored in
    # the 32-byte text field (longer keys are truncated), $filenum in
    # the 2-byte field and the byte offset $offset in the 4-byte field,
    # so offsets above 4294967295 cannot be represented and are rejected.
    # Returns 1; dies on an out-of-range number or a failed write.
    #
    sub GSI_WriteKeyRecord {
        my ($handle, $key, $filenum, $offset) = @_;
        my $fh = qualify_to_ref($handle, scalar(caller));
        return _gsi_write_record("GSI_WriteKeyRecord", $fh, $key,
                                 "file number", $filenum,
                                 "offset",      $offset);
    }
}




# First pass. Create an unsorted flat text file.
#
# Driver: build <seqfile>.gsi in two passes.
#   Pass 1 scans the FASTA file and writes one "<key> 1 <offset>" text
#          line per ">" header to a temporary file, where <offset> is
#          the byte position at which that header line starts.
#   Sort   the temporary file is sorted in place with sort(1).
#   Pass 2 writes the GSI header, the single file record and one key
#          record per sorted line.
#
use strict;
use warnings;

# Package globals assigned near the top of this script.
our ($seqfile, $gsifile, $tmpfile, $sqd_fmt_fasta);

# Largest offset that fits the 4-byte offset field of a GSI key record.
my $max_offset = 0xFFFFFFFF;

die "Usage: fasta2gsi <seqfile>\n"
    unless defined $seqfile && length $seqfile;

my $curr_offset = 0;
my $recnum      = 0;
print "Calculating offsets for $seqfile...\n";

# Open the input first so a missing sequence file leaves no stray
# temporary file behind.
open(my $seq_fh, '<', $seqfile)
    or die "fasta2gsi: cannot read $seqfile: $!\n";
open(my $tmp_fh, '>', $tmpfile)
    or die "fasta2gsi: cannot write $tmpfile: $!\n";
while (my $line = <$seq_fh>)
{
    if (my ($key) = $line =~ /^>\s*(\S+)/)
    {
        # Refuse to build an index whose offsets would wrap around.
        die "fasta2gsi: offset $curr_offset of '$key' exceeds the "
          . "GSI limit of $max_offset bytes\n"
            if $curr_offset > $max_offset;
        print {$tmp_fh} "$key 1 $curr_offset\n"
            or die "fasta2gsi: cannot write $tmpfile: $!\n";
        $recnum++;
    }
    # Start position of the next line, i.e. of the next possible header.
    $curr_offset = tell($seq_fh);
}
close($seq_fh);
close($tmp_fh)
    or die "fasta2gsi: cannot finish writing $tmpfile: $!\n";

# Sort the temporary file alphabetically on the key.
print "Sorting the intermediate index file...\n";
{
    # NOTE(review): byte-wise (C locale) ordering is presumably what GSI
    # readers expect when searching the keys -- confirm against the
    # reader. It also makes the result independent of the user's locale.
    local $ENV{LC_ALL} = 'C';

    # List form: the file name is never interpreted by a shell.
    system('sort', '-o', $tmpfile, '--', $tmpfile) == 0
        or die "fasta2gsi: sorting $tmpfile failed (wait status $?)\n";
}

# Second pass. Convert flat text file to binary GSI.
#
print "Writing the final binary GSI file...\n";
open(my $gsi_fh, '>', $gsifile)
    or die "fasta2gsi: cannot write $gsifile: $!\n";
binmode($gsi_fh);    # records are raw bytes; no newline translation
GSI_WriteHeader($gsi_fh, 1, $recnum);
GSI_WriteFileRecord($gsi_fh, $seqfile, 1, $sqd_fmt_fasta);

open($tmp_fh, '<', $tmpfile)
    or die "fasta2gsi: cannot read $tmpfile: $!\n";
while (my $line = <$tmp_fh>)
{
    my ($key, $filenum, $offset) = split ' ', $line;
    GSI_WriteKeyRecord($gsi_fh, $key, $filenum, $offset);
}
close($tmp_fh);
# Buffered write errors (e.g. a full disk) surface at close.
close($gsi_fh)
    or die "fasta2gsi: cannot finish writing $gsifile: $!\n";
unlink($tmpfile)
    or warn "fasta2gsi: could not remove $tmpfile: $!\n";

print "Complete.\n";
print "$gsifile indexes $recnum sequence names.\n";