1#! @PERL@
2
3# Usage: fasta2gsi <seqfile>
4# Creates seqfile.gsi
5#
6# Create a .gsi sequence database index file.
7#
8# GSI allows multiple files per index, but fasta2gsi.pl
9# creates a GSI index for a single FASTA file.
10#
11# Part of the SQUID sequence analysis library.
12# Copyright (C) 1992-1996 Sean R. Eddy
13
14
# The first command-line argument names the FASTA file to index.
$seqfile = shift;

# Both output names are the input name plus a suffix: the final binary
# index, and a scratch text file used between the two passes.
($gsifile, $tmpfile) = map { $seqfile . $_ } (".gsi", ".tmpgsi");
18
19# Library of Perl functions for creating GSI index files.
20#
21# GSI definition:
22#    1 + <nfiles> + <nkeys> total records.
23#    Each record = 38 bytes.
24#
25#  one header record     :  <"GSI"    (32)> <nfiles (2)> <nkeys (4)>
26#  <nfiles> file records :  <filename (32)> <fileno (2)> <fmt   (4)>
27#  <nkeys>  key records  :  <key      (32)> <fileno (2)> <offset(4)>
28#
29# Part of the SQUID sequence analysis library.
30# Copyright (C) 1992-1996 Sean R. Eddy
31
32
# Sequence file format codes. These values are not arbitrary: they MUST
# stay identical to the corresponding format codes in squid.h.
#
($sqd_fmt_genbank, $sqd_fmt_embl, $sqd_fmt_fasta, $sqd_fmt_pir)
    = (2, 4, 7, 12);
39
# Function: GSI_WriteHeader(GSIFILE, $filenum, $keynum)
#
# Emit the single header record that starts a GSI index file.
#
# Args:    GSIFILE  - open, writable filehandle (a glob, a glob reference,
#                     or the handle's name as a bareword call site passes it)
#          $filenum - number of file records in the index; packed as "n"
#                     (16-bit big-endian unsigned)
#          $keynum  - number of key records in the index; packed as "N"
#                     (32-bit big-endian unsigned)
#
# Returns: 1, unconditionally. The result of print is not checked.
#
sub GSI_WriteHeader {
    my ($fh, $filenum, $keynum) = @_;

    # "a32" NUL-pads the "GSI" tag to exactly 32 bytes, so the record is
    # always 32 + 2 + 4 = 38 bytes long.
    print {$fh} pack("a32 n N", "GSI", $filenum, $keynum);
    return 1;
}
51
# Function: GSI_WriteFileRecord(GSIFILE, $filename, $idx, $fmt)
#
# Emit one file record, describing a sequence file covered by the index.
#
# Args:    GSIFILE   - open, writable filehandle (a glob, a glob reference,
#                      or the handle's name as a bareword call site passes it)
#          $filename - name of the sequence file; stored in a 32-byte field,
#                      NUL-padded if shorter and truncated if longer
#          $idx      - file number that key records use to refer to this
#                      file; packed as "n" (16-bit big-endian unsigned)
#          $fmt      - sequence format code; packed as "N" (32-bit
#                      big-endian unsigned)
#
# Returns: 1, unconditionally. The result of print is not checked.
#
sub GSI_WriteFileRecord {
    my ($fh, $filename, $idx, $fmt) = @_;

    # Fixed-width layout: 32 + 2 + 4 = 38 bytes per record.
    print {$fh} pack("a32 n N", $filename, $idx, $fmt);
    return 1;
}
63
# Function: GSI_WriteKeyRecord(GSIFILE, $key, $filenum, $offset)
#
# Emit one key record, mapping a sequence name to its location.
#
# Args:    GSIFILE  - open, writable filehandle (a glob, a glob reference,
#                     or the handle's name as a bareword call site passes it)
#          $key     - retrieval key (sequence name); stored in a 32-byte
#                     field, NUL-padded if shorter and truncated if longer
#          $filenum - number of the file record the key belongs to; packed
#                     as "n" (16-bit big-endian unsigned)
#          $offset  - byte offset of the entry within that file; packed as
#                     "N" (32-bit big-endian unsigned)
#
# Returns: 1, unconditionally. The result of print is not checked.
#
sub GSI_WriteKeyRecord {
    my ($fh, $key, $filenum, $offset) = @_;

    # Fixed-width layout: 32 + 2 + 4 = 38 bytes per record.
    print {$fh} pack("a32 n N", $key, $filenum, $offset);
    return 1;
}
75
76
77
78
# Main driver: index one FASTA file in two passes with an external sort
# in between, so the whole key list never has to be held in memory.
#
# Refuse to run without an input file name; otherwise the derived output
# names would be just ".gsi" and ".tmpgsi".
die "Usage: fasta2gsi <seqfile>\n"
    unless defined($seqfile) && length($seqfile);

# Limits imposed by the record layout used by the GSI_Write* routines
# (pack template "a32 n N"): offsets are 4-byte unsigned, keys are 32 bytes.
my $max_offset = 4294967295;
my $max_keylen = 32;

# First pass. Create an unsorted flat text file with one line per
# sequence: "<key> <fileno> <offset>". The file number is always 1
# because this script indexes a single file.
#
my $curr_offset = 0;    # byte offset of the start of the line being read
my $recnum      = 0;    # number of keys found
my $long_keys   = 0;    # number of keys too long for the key field
print "Calculating offsets for $seqfile...\n";
open(my $seq_fh, '<', $seqfile)
    or die "fasta2gsi: cannot open $seqfile for reading: $!\n";
open(my $tmp_fh, '>', $tmpfile)
    or die "fasta2gsi: cannot open $tmpfile for writing: $!\n";
while (my $line = <$seq_fh>) {
    # A FASTA header is ">" followed by the name; the name is the key.
    if (my ($key) = $line =~ /^>\s*(\S+)/) {
        if ($curr_offset > $max_offset) {
            close($tmp_fh);
            unlink $tmpfile;
            die "fasta2gsi: offset $curr_offset of $key does not fit "
              . "in the 4-byte GSI offset field\n";
        }
        $long_keys++ if length($key) > $max_keylen;
        print {$tmp_fh} "$key 1 $curr_offset\n";
        $recnum++;
    }
    # Remember where the NEXT line starts, so that a header line is
    # recorded with the offset of its own first byte.
    $curr_offset = tell($seq_fh);
}
close($seq_fh);
close($tmp_fh)
    or die "fasta2gsi: error writing $tmpfile: $!\n";
warn "fasta2gsi: warning: $long_keys key(s) longer than $max_keylen "
   . "bytes will be truncated in the index\n" if $long_keys;

# Sort the temporary file on the key.
# The list form of system() bypasses the shell, so file names containing
# spaces or shell metacharacters are passed through intact. LC_ALL=C forces
# plain byte-order collation instead of the user's locale.
# NOTE(review): byte order is presumably what GSI readers expect when they
# search the keys (e.g. strcmp in the C library) - confirm against squid.
print "Sorting the intermediate index file...\n";
{
    local $ENV{LC_ALL} = 'C';
    my $status = system('sort', '-k', '1,1', '-o', $tmpfile, $tmpfile);
    if ($status != 0) {
        unlink $tmpfile;
        die "fasta2gsi: sort failed on $tmpfile (status $status)\n";
    }
}

# Second pass. Convert flat text file to binary GSI:
# one header record, one file record, then the sorted key records.
#
print "Writing the final binary GSI file...\n";
open(my $gsi_fh, '>', $gsifile)
    or die "fasta2gsi: cannot open $gsifile for writing: $!\n";
binmode($gsi_fh);    # packed records must not undergo newline translation
GSI_WriteHeader($gsi_fh, 1, $recnum);
GSI_WriteFileRecord($gsi_fh, $seqfile, 1, $sqd_fmt_fasta);

open(my $sorted_fh, '<', $tmpfile)
    or die "fasta2gsi: cannot open $tmpfile for reading: $!\n";
while (my $line = <$sorted_fh>) {
    my ($key, $filenum, $offset) = split ' ', $line;
    GSI_WriteKeyRecord($gsi_fh, $key, $filenum, $offset);
}
close($sorted_fh);
# Buffered write errors (e.g. a full disk) only surface at close.
close($gsi_fh)
    or die "fasta2gsi: error writing $gsifile: $!\n";
unlink $tmpfile;

print "Complete.\n";
print "$gsifile indexes $recnum sequence names.\n";
121