src/convParse/gc_migrate.cpp

// $Id: gc_migrate.cpp,v 1.33 2011/03/08 19:22:00 bobgian Exp $

/*
  Copyright 2002  Mary Kuhner, Jon Yamato, and Joseph Felsenstein

  This software is distributed free of charge for non-commercial use
  and is copyrighted.  Of course, we do not guarantee that the software
  works, and are not responsible for any damage you may cause or have.
*/

#include <cassert>

#include "gc_data.h"
#include "gc_datastore.h"
#include "gc_errhandling.h"
#include "gc_file.h"
#include "gc_file_util.h"
#include "gc_infile_err.h"
#include "gc_migrate.h"
#include "gc_parse_block.h"
#include "gc_strings_infile.h"

#include "wx/log.h"
#include "wx/tokenzr.h"
#include "wx/txtstrm.h"
#include "wx/wfstream.h"

//------------------------------------------------------------------------------------

// Constructs a MIGRATE-format parser. The data store reference is passed
// straight through to the GCParser base class; no other state is set here.
GCMigrateParser::GCMigrateParser(const GCDataStore& ds)
    : GCParser(ds)
{
    // nothing to do beyond base-class construction
}

// Entry point for parsing a MIGRATE file.
//
// Opens the input streams for the file, then hands off to AlleleParse or
// NucParse depending on which family of data the caller asked for.  The
// requested data type must not claim to be both allelic and nucleic.
//
// Returns the parse produced by the selected helper.  Falling off the end
// (neither family requested) is a programming error: it asserts in debug
// builds and returns NULL otherwise.
GCParse *
GCMigrateParser::Parse( GCFile &            fileRef,
                        gcGeneralDataType   dataType,
                        GCInterleaving      interleaving)
{
    SetUpStreams(fileRef.GetName());

    // a single request may not mix the two families
    assert( !dataType.HasAllelic() || !dataType.HasNucleic() );

    if(dataType.HasAllelic())
    {
        return AlleleParse(fileRef,dataType,interleaving);
    }
    else if(dataType.HasNucleic())
    {
        return NucParse(fileRef,dataType,interleaving);
    }

    // caller supplied a data type that is neither allelic nor nucleic
    assert(false);
    return NULL;
}

// Parses a MIGRATE file holding nucleotide-style data.
//
// Layout consumed, in order:
//   1. the first line (optional data-type letter, population count, locus
//      count, trailing comment),
//   2. one line of locus lengths,
//   3. for every population: a population-info line followed by the data
//      for each locus.
//
// Returns a pointer to the parse created by MakeParse.  On failure the parse
// is deleted here and the exception is re-thrown after being annotated with
// the file name (and, for gc_infile_err, the current row).  A gc_eof is
// tolerated when CompleteParse reports that everything expected was already
// read.
GCParse *
GCMigrateParser::NucParse(GCFile & fileRef, gcGeneralDataType dataType, GCInterleaving interleaving)
{

    GCParse & parseData = MakeParse(fileRef,format_MIGRATE,dataType,interleaving);

    gcSpecificDataType  dataTypeSpecInFile;
    size_t              numPops;
    size_t              numLoci;
    wxString            comment;

    try
    {
        ParseMigrateFirstLine(dataTypeSpecInFile,numPops,numLoci,comment);
        SetDataTypeFromFile(parseData,dataTypeSpecInFile);

        std::vector<size_t> locusLengths = ParseMigrateLocusLengths();

        // The per-population sample counts below are sized by the number of
        // locus lengths actually read, but indexed up to the locus count
        // declared on the first line.  If the file declares more loci than
        // it supplies lengths for, that indexing would run past the end of
        // the vector, so reject the file here instead.  (A file supplying
        // MORE lengths than declared keeps its previous behaviour.)
        if(locusLengths.size() < numLoci)
        {
            wxString lengthsFound;
            for(size_t i=0;
                i < locusLengths.size();
                i++)
            {
                if(i > 0) lengthsFound += " ";
                lengthsFound += wxString::Format("%lu",
                                                 static_cast<unsigned long>(locusLengths[i]));
            }
            throw gc_migrate_too_few_sequence_lengths(numLoci,lengthsFound);
        }

        for(size_t i=0;
            i < locusLengths.size();
            i++)
        {
            AddLocus(parseData,i,locusLengths[i]);
        }

        for(size_t popIndex = 0;
            popIndex < numPops;
            popIndex++)
        {
            wxString popComment;
            std::vector<size_t> numSamples = ParseMigratePopulationInfo(popComment,locusLengths.size());
            assert(numSamples.size() == locusLengths.size());
            AddPop(parseData,popIndex,popComment);

            // safe: numLoci <= locusLengths.size() == numSamples.size()
            for(size_t locIndex = 0;
                locIndex < numLoci;
                locIndex++)
            {
                FillData(parseData,popIndex,locIndex,interleaving,numSamples[locIndex]);
            }
        }
        CheckNoExtraData();
        return &parseData;
    }
    catch(gc_eof& e)
    {
        // Hitting end-of-file is acceptable if all expected data was read.
        if(CompleteParse(parseData))
        {
            return &parseData;
        }
        else
        {
            delete &parseData;
            e.setFile(fileRef.GetName());
            throw;
        }
    }
    catch(gc_infile_err& f)
    {
        delete &parseData;
        f.setFile(fileRef.GetName());
        f.setRow(m_linesRead);
        throw;
    }
    // not reachable: every path above returns or throws
    assert(false);
    return NULL;

}

// Parses a MIGRATE file holding allelic data.
//
// Layout consumed, in order:
//   1. the first line (optional data-type letter, population count, site
//      count, optional single-character delimiter, trailing comment),
//   2. for every population: a population-info line followed by its data.
// All data is stored under a single locus (index 0) whose length is the
// site count from the first line.
//
// Returns a pointer to the parse created by MakeParse.  On failure any parse
// created so far is deleted and the exception is re-thrown after being
// annotated with the file name (and, for gc_infile_err, the current row).
// A gc_eof is tolerated when CompleteParse reports that everything expected
// was already read.
GCParse *
GCMigrateParser::AlleleParse(   GCFile &            fileRef,
                                gcGeneralDataType   dataType,
                                GCInterleaving      interleaving)
{
    gcSpecificDataType  dataTypeSpecInFile;
    size_t              numPops;
    size_t              numSites;
    wxString            delimiter;
    wxString            comment;

    // Stays NULL until MakeParse has succeeded, so the handlers below know
    // whether there is anything to validate or delete.
    GCParse * parseDataP = NULL;

    try
    {
        // The first line is parsed inside the try block so that header
        // errors are annotated with file and row, as in NucParse.  The parse
        // cannot be created earlier because MakeParse needs the delimiter
        // found on that line.
        ParseMigrateFirstLine(dataTypeSpecInFile,numPops,numSites,delimiter,comment);
        parseDataP = &MakeParse(fileRef,format_MIGRATE,dataType,interleaving,delimiter);
        GCParse & parseData = *parseDataP;
        SetDataTypeFromFile(parseData,dataTypeSpecInFile);

        AddLocus(parseData,0,numSites);
        for(size_t popIndex = 0;
            popIndex < numPops;
            popIndex++)
        {
            wxString popComment;
            std::vector<size_t> numSamples = ParseMigratePopulationInfo(popComment,1);    // EWFIX.P3 -- constant
            assert(numSamples.size() == 1);
            AddPop(parseData,popIndex,popComment);
            FillData(parseData,popIndex,0,interleaving,numSamples[0]);
        }
        CheckNoExtraData();
        return parseDataP;
    }
    catch(gc_eof& e)
    {
        // Hitting end-of-file is acceptable if all expected data was read.
        if(parseDataP != NULL && CompleteParse(*parseDataP))
        {
            return parseDataP;
        }
        else
        {
            delete parseDataP;      // deleting NULL is a no-op
            e.setFile(fileRef.GetName());
            throw;
        }
    }
    catch(gc_infile_err& f)
    {
        delete parseDataP;          // deleting NULL is a no-op
        f.setFile(fileRef.GetName());
        f.setRow(m_linesRead);
        throw;
    }
    // not reachable: every path above returns or throws
    assert(false);
    return NULL;
}

// Destructor.  This class releases nothing of its own here.
GCMigrateParser::~GCMigrateParser()
{
    // intentionally empty
}

// Reads and interprets the first line of a MIGRATE file.
//
// Expected tokens, separated by whitespace:
//   [type-letter] <population count> <locus count> [rest of line]
//
// dataTypeSpecInFile  out: data type named by the optional single-character
//                     first token, or sdatatype_NONE_SET when the token is
//                     absent or unrecognised (an unrecognised letter also
//                     produces a warning through the data store)
// numPopsRef          out: population count
// numLociRef          out: locus count
// comment             out: whatever remains on the line after the counts
//
// Throws gc_migrate_bad_pop_count or gc_migrate_bad_locus_count when the
// corresponding token is not a positive integer.
void
GCMigrateParser::ParseMigrateFirstLine(
    gcSpecificDataType &        dataTypeSpecInFile,
    size_t &            numPopsRef,
    size_t &            numLociRef,
    wxString &          comment)
{
    wxString headerLine = ReadLine();
    wxStringTokenizer headerTokens(headerLine);

    dataTypeSpecInFile = sdatatype_NONE_SET;
    wxString token = headerTokens.GetNextToken();

    // An optional one-character, non-numeric leading token names the data
    // type.  A numeric first token means the type letter was omitted and we
    // are already looking at the population count.
    const bool hasTypeLetter = (token.Len() == 1) && !token.IsNumber();
    if(hasTypeLetter)
    {
        if(token.IsSameAs("a",false) || token.IsSameAs("e",false))
        {
            dataTypeSpecInFile = sdatatype_KALLELE;
        }
        else if(token.IsSameAs("m",false))
        {
            dataTypeSpecInFile = sdatatype_MICROSAT;
        }
        else if(token.IsSameAs("n",false))
        {
            dataTypeSpecInFile = sdatatype_SNP;
        }
        else if(token.IsSameAs("s",false))
        {
            dataTypeSpecInFile = sdatatype_DNA;
        }
        else
        {
            // unknown letter: warn, leave the type unset, and keep going
            wxString msg = wxString::Format(gcerr_migrate::firstToken,token.c_str());
            m_dataStore.GCWarning(msg);
        }
        token = headerTokens.GetNextToken();
    }

    // population count -- must be a positive integer
    long popCount;
    if(!token.ToLong(&popCount) || popCount <= 0)
    {
        throw gc_migrate_bad_pop_count(token);
    }
    numPopsRef = (size_t)popCount;

    // locus count -- must be a positive integer
    token = headerTokens.GetNextToken();
    long locusCount;
    if(!token.ToLong(&locusCount))
    {
        throw gc_migrate_bad_locus_count(token);
    }
    if(locusCount <= 0)
    {
        throw gc_migrate_bad_locus_count(token);
    }
    numLociRef = (size_t)locusCount;

    // everything not yet tokenized is handed back as the comment
    comment = headerTokens.GetString();
}

// First-line parser for files that may name a delimiter.
//
// Delegates to the four-argument overload, then inspects the first token of
// the returned comment.  If IsLegalDelimiter accepts that token it is moved
// out of the comment and into `delimiter`; otherwise `delimiter` is left
// empty and the comment is left as the four-argument overload produced it.
//
// Propagates whatever the four-argument overload or IsLegalDelimiter throws.
void
GCMigrateParser::ParseMigrateFirstLine( gcSpecificDataType& dataTypeSpecInFile,
                                        size_t &            numPopsRef,
                                        size_t &            numLociRef,
                                        wxString &          delimiter,
                                        wxString &          comment)
{
    // default: no delimiter specified
    delimiter.Empty();
    ParseMigrateFirstLine(dataTypeSpecInFile,numPopsRef,numLociRef,comment);

    wxStringTokenizer commentTokens(comment);
    if(!commentTokens.HasMoreTokens())
    {
        return;
    }

    wxString candidate = commentTokens.GetNextToken();
    if(!IsLegalDelimiter(candidate))
    {
        return;
    }

    // the token was a delimiter, so it is not part of the comment
    delimiter = candidate;
    comment = commentTokens.GetString();
}

// Decides whether a token may serve as the allele delimiter.
//
// Returns false when the candidate is not exactly one character long and
// true for any other single character, with one exception: the character
// used to mark missing data is rejected outright by throwing
// gc_migrate_bad_delimiter rather than by returning false.
bool
GCMigrateParser::IsLegalDelimiter(wxString delimCandidate)
{
    if(delimCandidate.Length() != 1) return false;
    if(delimCandidate[0] == gcstr_migrate::missingData)
    {
        throw gc_migrate_bad_delimiter(delimCandidate);
    }
    return true;
}

// Reads one line and interprets every whitespace-separated token on it as a
// locus length.
//
// Returns the lengths in the order they appear; a line with no tokens gives
// an empty vector.
// Throws gc_migrate_locus_length_not_positive for any token that is not a
// positive integer.
std::vector<size_t>
GCMigrateParser::ParseMigrateLocusLengths()
{
    wxString lociLengthLine = ReadLine();
    wxStringTokenizer tokenizer(lociLengthLine);
    std::vector<size_t> locusLengths;

    while(tokenizer.HasMoreTokens())
    {
        wxString token = tokenizer.GetNextToken();
        long longVal;
        if(!token.ToLong(&longVal) || longVal <= 0)
        {
            throw gc_migrate_locus_length_not_positive(token);
        }
        locusLengths.push_back(static_cast<size_t>(longVal));
    }
    return locusLengths;
}

// Reads one population-info line: sample counts followed by the population
// name.
//
// Two layouts are accepted:
//   * `locusCount` positive integers (one per locus), then the name;
//   * a single positive integer followed by a non-numeric token, in which
//     case that one count is applied to every locus and the non-numeric
//     token is treated as the first word of the name.
//
// populationName  out: the name, with leading and trailing whitespace
//                 removed
// locusCount      number of loci a count is needed for
//
// Returns a vector of exactly `locusCount` sample counts.
// Throws gc_migrate_bad_sequence_count when a count is zero or negative, and
// gc_migrate_too_few_sequence_lengths when a non-numeric token is reached
// after reading some number of counts other than exactly one.
std::vector<size_t>
GCMigrateParser::ParseMigratePopulationInfo(wxString & populationName, size_t locusCount)
{
    std::vector<size_t> numSamplesForEachLocus;

    wxString line = ReadLine();
    wxStringTokenizer tokenizer(line);
    wxString lastToken = wxEmptyString;
    bool shouldUseLastToken = false;

    for(size_t i = 0;
        i < locusCount ;
        i++)
    {
        lastToken = tokenizer.GetNextToken();
        long longVal;
        if(!lastToken.ToLong(&longVal))
        {
            // A non-numeric token ends the list of counts.  That is only
            // legal when exactly one count was given; it then stands for
            // every locus.
            if(numSamplesForEachLocus.size() != 1)
            {
                throw gc_migrate_too_few_sequence_lengths(locusCount,line);
            }
            const size_t sharedCount = numSamplesForEachLocus[0];
            numSamplesForEachLocus.assign(locusCount,sharedCount);

            // the token we just consumed belongs to the population name
            shouldUseLastToken = true;
            break;
        }
        if(longVal <= 0)
        {
            throw gc_migrate_bad_sequence_count(lastToken);
        }
        numSamplesForEachLocus.push_back(static_cast<size_t>(longVal));
    }
    assert(numSamplesForEachLocus.size() == locusCount);

    populationName = tokenizer.GetString();
    if(shouldUseLastToken)
    {
        // put back the name's first word, which the count loop swallowed
        populationName = wxString::Format("%s %s",
                                          lastToken.c_str(),
                                          populationName.c_str());
    }

    populationName.Trim(true);
    populationName.Trim(false);
    return numSamplesForEachLocus;
}

// Reports whether a parse already holds everything it was expected to hold.
// The parsers call this after an end-of-file to decide whether the partial
// result can be returned.
//
// Returns true only when all of the following hold:
//   * there is at least one population and at least one locus,
//   * the number of blocks equals populations times loci,
//   * no block pointer is NULL,
//   * every block holds exactly its expected number of sequences,
//   * no block reports incomplete sequences.
bool
GCMigrateParser::CompleteParse(GCParse & parseData)
{
    // need at least one population ...
    const size_t popTotal = parseData.GetPopCount();
    if(popTotal == 0)
    {
        return false;
    }

    // ... and at least one locus
    const size_t locusTotal = parseData.GetLociCount();
    if(locusTotal == 0)
    {
        return false;
    }

    // one block per (population, locus) pair
    constBlockVector allBlocks = parseData.GetBlocks();
    if(allBlocks.size() != popTotal * locusTotal)
    {
        return false;
    }

    // every block must be present and fully populated
    for(constBlockVector::const_iterator blockIter = allBlocks.begin();
        blockIter != allBlocks.end();
        ++blockIter)
    {
        const GCParseBlock * block = *blockIter;
        if(block == NULL)
        {
            return false;
        }

        // sequence count must match what the block was told to expect
        size_t wantedSequences = block->GetExpectedNumSequences();
        const GCParseSamples & heldSamples = block->GetSamples();
        const bool wrongSequenceCount = (heldSamples.size() != wantedSequences);

        // and no sequence may be short of sites
        if(wrongSequenceCount || block->HasIncompleteSequences())
        {
            return false;
        }
    }

    return true;
}

//____________________________________________________________________________________