src/muscle/fastdistkmer.cpp

#include "muscle.h"
#include "msa.h"
#include "seqvect.h"
#include "seq.h"
#include "distfunc.h"
#include <math.h>
#include "muscle_context.h"

#define TRACE	0

/***
Some candidate alphabets considered because they
have high correlations and small table sizes.
Correlation coefficent is between k-mer distance
and %id D measured from a CLUSTALW alignment.
Table size is N^k where N is size of alphabet.
A is standard (uncompressed) amino alphabet.

                           Correlation
Alpha   N  k  Table Size   all   25-50%
-----  --  -  ----------   ----  ------
A      20  3       8,000  0.943   0.575
A      20  4     160,000  0.962   0.685 <<
LiA    14  4      38,416  0.966   0.645
SEB    14  4      38,416  0.964   0.634
LiA    13  4      28,561  0.965   0.640
LiA    12  4      20,736  0.963   0.620
LiA    10  5     100,000  0.964   0.652

We select A with k=4 because it has the best
correlations. The only drawback is a large table
size, but space is readily available and the only
additional time cost is in resetting the table to
zero, which can be done quickly with memset or by
keeping a list of the k-mers that were found (should
test to see which is faster, and may vary by compiler
and processor type). It also has the minor advantage
that we don't need to convert the alphabet.

Fractional identity d is estimated as follows.

	F = fractional k-mer count
	if F is 0: F = 0.01
	Y = log(0.02 + F)
	d = -4.1 + 4.12*Y

The constant 0.02 was chosen to make the relationship
between Y and D linear. The constants -4.1 and 4.12
were chosen to fit a straight line to the scatterplot
of Y vs D.
***/

#define MIN(x, y)	(((x) < (y)) ? (x) : (y))

const unsigned K = 4;
const unsigned N = 20;
const unsigned N_2 = 20*20;
const unsigned N_3 = 20*20*20;
const unsigned N_4 = 20*20*20*20;

const unsigned TABLE_SIZE = N_4;

// For debug output
// warning: unsafe
//const char *KmerToStr(unsigned Kmer)
//	{
//    MuscleContext *ctx = getMuscleContext();
//    char* g_LetterToChar = ctx->alpha.g_LetterToChar;
//	static char s[5];
//
//	unsigned c3 = (Kmer/N_3)%N;
//	unsigned c2 = (Kmer/N_2)%N;
//	unsigned c1 = (Kmer/N)%N;
//	unsigned c0 = Kmer%N;
//
//	s[0] = LetterToChar(c3);
//	s[1] = LetterToChar(c2);
//	s[2] = LetterToChar(c1);
//	s[3] = LetterToChar(c0);
//	return s;
//	}

void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])
	{
#if	TRACE
	Log("CountKmers\n");
#endif
	memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));

	const byte *ptrKmerStart = s;
	const byte *ptrKmerEnd = s + 4;
	const byte *ptrSeqEnd = s + uSeqLength;

	unsigned c3 = s[0]*N_3;
	unsigned c2 = s[1]*N_2;
	unsigned c1 = s[2]*N;
	unsigned c0 = s[3];

	unsigned Kmer = c3 + c2 + c1 + c0;

	for (;;)
		{
		assert(Kmer < TABLE_SIZE);

#if	TRACE
		//Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));
#endif
		++(KmerCounts[Kmer]);

		if (ptrKmerEnd == ptrSeqEnd)
			break;

	// Compute k-mer as function of previous k-mer:
	// 1. Subtract first letter from previous k-mer.
	// 2. Multiply by N.
	// 3. Add next letter.
		c3 = (*ptrKmerStart++) * N_3;
		Kmer = (Kmer - c3)*N;
		Kmer += *ptrKmerEnd++;
		}
	}

unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,
  const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)
	{
	byte KmerCounts2[TABLE_SIZE];
	CountKmers(Seq2, uSeqLength2, KmerCounts2);

	const byte *ptrKmerStart = Seq;
	const byte *ptrKmerEnd = Seq + 4;
	const byte *ptrSeqEnd = Seq + uSeqLength;

	unsigned c3 = Seq[0]*N_3;
	unsigned c2 = Seq[1]*N_2;
	unsigned c1 = Seq[2]*N;
	unsigned c0 = Seq[3];

	unsigned Kmer = c3 + c2 + c1 + c0;

	unsigned uCommonCount = 0;
	for (;;)
		{
		assert(Kmer < TABLE_SIZE);

		const byte Count1 = KmerCounts1[Kmer];
		const byte Count2 = KmerCounts2[Kmer];

		uCommonCount += MIN(Count1, Count2);

	// Hack so we don't double-count
		KmerCounts2[Kmer] = 0;

		if (ptrKmerEnd == ptrSeqEnd)
			break;

	// Compute k-mer as function of previous k-mer:
	// 1. Subtract first letter from previous k-mer.
	// 2. Multiply by N.
	// 3. Add next letter.
		c3 = (*ptrKmerStart++) * N_3;
		Kmer = (Kmer - c3)*N;
		Kmer += *ptrKmerEnd++;
		}
	return uCommonCount;
	}

static void SeqToLetters(const Seq &s, byte Letters[])
	{
    MuscleContext *ctx = getMuscleContext();
    bool* g_IsWildcardChar = ctx->alpha.g_IsWildcardChar;
    unsigned* g_CharToLetter = ctx->alpha.g_CharToLetter;

	const unsigned uSeqLength = s.Length();
	for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)
		{
		char c = s.GetChar(uCol);
	// Ugly hack. My k-mer counting code isn't wild-card
	// aware. Arbitrarily replace wildcards by a specific
	// amino acid.
		if (IsWildcardChar(c))
			c = 'A';
		*Letters++ = CharToLetter(c);
		}
	}

void FastDistKmer(const SeqVect &v, DistFunc &DF)
	{
	byte KmerCounts[TABLE_SIZE];

	const unsigned uSeqCount = v.GetSeqCount();

	DF.SetCount(uSeqCount);
	if (0 == uSeqCount)
		return;

// Initialize distance matrix to zero
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			DF.SetDist(uSeq1, uSeq2, 0);
		}

	unsigned uMaxLength = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const Seq &s = v.GetSeq(uSeqIndex);
		unsigned uSeqLength = s.Length();
		if (uSeqLength > uMaxLength)
			uMaxLength = uSeqLength;
		}
	if (0 == uMaxLength)
		return;

	byte *Seq1Letters = new byte[uMaxLength];
	byte *Seq2Letters = new byte[uMaxLength];

	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)
		{
		const Seq &s1 = v.GetSeq(uSeqIndex1);
		const unsigned uSeqLength1 = s1.Length();

		SeqToLetters(s1, Seq1Letters);
		CountKmers(Seq1Letters, uSeqLength1, KmerCounts);

		for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;
		  ++uSeqIndex2)
			{
			const Seq &s2 = v.GetSeq(uSeqIndex2);
			const unsigned uSeqLength2 = s2.Length();

			SeqToLetters(s2, Seq2Letters);

			unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,
			  KmerCounts, Seq2Letters, uSeqLength2);

			unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);
			double F = (double) uCommonKmerCount / (uMinLength - K + 1);
			if (0.0 == F)
				F = 0.01;
//			double Y = log(0.02 + F);
//			double EstimatedPctId = Y/4.12 + 0.995;
//			DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);
			DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));
#if	TRACE
            double KD = KimuraDist(EstimatedPctId);
			Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",
			  uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);
#endif
			}
		}

	delete[] Seq1Letters;
	delete[] Seq2Letters;
	}