src/nnet/traindata.h

/** @file traindata.h

  @brief CTrainingData header - Neural Network Training Data

  @author Jakub Ad�mek
  Last modified $Id: traindata.h,v 1.21 2002/04/22 15:43:23 jakubadamek Exp $
*/

#ifndef __BANG_NNDATA_H__
#define __BANG_NNDATA_H__

#include "utils.h"
#include "xmlstream.h"
#include "assert.h"

SMALLVECTOR (double, CFloat, TSmallVectorFloat)
VECTOR (CFloat, TVectorFloat)
//typedef TVectorFloat TSmallVectorFloat;
VECTOR (CInt, TVectorInt)
MAP (CString, CFloat, TMapStringFloat)

/** @brief training pattern - inputs, desired outputs and set */

enum enumSetType { ST_NULL=-1, ST_EVAL=0, ST_TRAIN=1 };
static const TEnumString SetTypes = "eval;train";

VECTOR (TStream, TStreams)

STRUCT_BEGIN (TPattern)
public:
	void Init ()	{ set = ST_NULL; series=0; }
	/// data[CU_INPUT], data[CU_OUTPUT]
	TSmallVectorFloat data[2];
	/// which set contains this pattern - see enumSetType
	CInt set;
	/// which series contains this pattern
	CInt series;
	/// the rows are sorted by sets
	bool operator < (const TPattern &pat) const
	{ return pat.set < set; }
STRUCT_END (TPattern)

VECTOR (TPattern, TPatterns)

STRUCT_BEGIN (TTrainingDataFile)
public:
	/// streams containing data (to be joined from left to right)
	TStreams streams;
STRUCT_END (TTrainingDataFile)

VECTOR (TTrainingDataFile, TTrainingDataFiles)

/** C O L U M N   S T A F F */

/** Types of values */
enum filterType {
	/// boolean - two different values, e.g. 0/1,yes/no
	FT_BOOL,
	/// integer number
	FT_INT,
	/// floating point number
	FT_FLOAT,
	/// category (text value), each row contains one
	FT_CATEGORY,
	/// category (text value), each row contains any number of them (0-n)
	FT_MULTICATEGORY
};

static const TEnumString FTs = "bool;int;float;category;multicategory";

/** How to translate values in column */
enum filterTranslate {
	/** Default for everything except multi-category
		- linearly transform from [min;max] into [-1;1]
		For categories: assign order values (0,1,2,...) to them and linearly transform.
		Not usable for multi-categories.
	*/
	FTR_FLOAT,
	/** uses the parameters "avg" and "stdev": x -> (x - avg) / stdev */
	FTR_LINEAR,
	/** For category / multi-category - assign one input 0/1 to every possible value */
	FTR_BOOLS,
	/** Don't translate - use values as they are */
	FTR_NONE
};

static const TEnumString FTRs = "float;linear;bools;none";


STRUCT_BEGIN (TFilter)
	void Init ()	{ type=FT_FLOAT; translate=FTR_FLOAT; minR=0; maxR=0;
			  toMinR=-1; toMaxR=1; toAvg=0; toStdev=1; avg=0; stdev=1;
 			  digits=-1; }


	/// see enum filterType
	CInt type;
	/// see enum filterTranslate
	CInt translate;
	/// interval to transform from by FLOAT
	CFloat minR, maxR;
	/// interval to transform to by FLOAT (usually [-1;+1])
	CFloat toMinR, toMaxR;
	/// average and standard deviation for LINEAR transformation
	CFloat avg, stdev;
	/// average and standard deviation after LINEAR transformation (usually 0,1)
	CFloat toAvg, toStdev;
	/// categories
	TEnumString categories;
	/// for doubles used as categories - number of decimal digits when converting to string
	CInt digits;
	/// number of outputs when preprocessing
	inline int getWidth() const;

	/** PRE- and POST- PROCESSING
		Following functions all use the last argument to place results.
		That's because some of them may place more than one results and also
		the result type may be specified by overloading, not by function name. */

	/// returns preprocessed value (or array of values on FTR_BOOLS)
	inline void preprocess (double value, double &result);
	/// returns preprocessed value (or array of values on FTR_BOOLS)
	inline void preprocess (const CString &value, double &result);

	/** returns postprocessed double value.	Inverse to preprocess. */
	inline void postprocess (const double &value, double &result) const;
	/** returns postprocessed string value. Inverse to preprocess. */
	inline void postprocess (const double &value, CString &result) const;

	/** returns postprocessed double value, but doesn't add the average - only multiplies */
	inline void postprocessError (const double &value, double &result) const;
STRUCT_END (TFilter)

/** Usage of column */
enum colUse {
	/// input
	CU_INPUT,
	/// output
	CU_OUTPUT,
	/// not used
	CU_NO
};

static const TEnumString CUs = "input;output;no";

STRUCT_BEGIN_FROM (TColumn, TFilter)
	void Init ()	{ use=CU_NO; pos = 0; }
	/// see enum colUse
	CInt use;
	/// starting position in the input or output vector
	CInt pos;

	/// window - use only for series: take appropriate values from neighbours
	STRUCT_BEGIN (TWindow)
		void Init()	{ left = right = 0; empty = "0"; }

		CInt left, right;
		/// default value when no such neighbours exist (e.g. left from 1st)
		CString empty;
	STRUCT_END (TWindow)
	TWindow window;

	inline int getColWidth() const;

	/** Returns appropriate input or output column value. For CTR_BOOLS returns the first value.
		"shift" relates to window, ranges in [-left;+right] */
	inline double &val (TPattern &pat, int shift=0) const;
	inline double val (const TPattern &pat, int shift=0) const;
	inline void copy (const TPattern &src, int shiftSrc, TPattern &dst, int shiftDst);
STRUCT_END_FROM (TColumn, TFilter)
VECTOR (TColumn, TColumns)


/** @brief Controls manipulation with training data. Allows to define sets of training data.

	This class has an iterator holding current position in the data. You can move it with move....
*/

STRUCT_BEGIN (CTrainingData)
public:
	void Init ()	{ pos=data.end(); series=false; colCount[CU_INPUT]=colCount[CU_OUTPUT]=0; }

	/// Moves position to beginning. Returns: true if position valid.
	bool moveFirst () const					{ return (pos = data.begin()) != data.end(); }
	bool moveNext () const					{ return ++pos != data.end(); }
	/** Moves position to next row. Returns: true if position valid.
		Inline to be quickly. */
	bool moveNext (CInt set) const;
	bool movePrev (CInt set) const;

	/// Moves position to start of given set. Returns: true if position valid
	bool moveToSetStart (CInt setType) const;

	/// Moves position to the row of a given index. Returns: true if position valid
	bool moveToRow (CInt row) const;

	/// getSet returns: set of current row or ST_NULL if position not valid
	inline CInt getSet () const
	{ if (pos == data.end()) return ST_NULL; else return (*pos).set; }

	/// getInputs returns: inputs on current row
	inline const TSmallVectorFloat &getInputs () const	{ return pos->data[CU_INPUT]; }
	/// getOutputs returns: desired outputs on current row
	inline const TSmallVectorFloat &getOutputs () const	{ return pos->data[CU_OUTPUT]; }
	inline TSmallVectorFloat &modifyOutputs ()		{ return pos->data[CU_OUTPUT]; }

	inline const TPattern &getPattern () const		{ return *pos; }
	inline int getSeries () const
	{ if (pos == data.end()) return -1; else return (int)pos->series; }
	inline TPatterns::iterator getPos () const		{ return pos; }
	inline void setPos (TPatterns::iterator x) const	{ pos = x; }

	STRUCT_BEGIN (TRange)
		void Init ()		{ from=0; to=0; random=0; rest=false; type=ST_NULL; }
		/** negative values mean size in percent; to = 0 means until the end of file */
		CInt from, to;
		/** Number of lines to be randomly chosen from this range. Zero means don't use random. */
		CInt random;
		/** Bool: Use the rest of the range - useful especially when another set uses random lines */
		CInt rest;
		/// type of data set - see enumSetType
		CInt type;
	STRUCT_END (TRange)
	VECTORC (TRange, TRanges)

	/// goes through ranges and sets the set info to rows. Returns error or empty string.
	CString processRanges ();

	/// goes through all data and columns and sets the window results
	void processWindows ();

	const TColumns &getColumns () const	{ return columns; }
	CInt getColCount(colUse use) const	{ assert (use==CU_INPUT || use==CU_OUTPUT); return colCount[use]; }
	CInt getRowCount(enumSetType set) const	{ return rowCount[set]; }
	void addColumn (TColumn col);
	void addFile (const TTrainingDataFile &f){ files.push_back (f); }

	/// sets directly the column value (caller must know about columns), uses the column translation
	//inline void setColDirect (colUse use, int icol, double value);
	/** returns double column value translated from direct values */
	inline void getPostprocessed (colUse use, int col, double &result);
	/** returns string column value translated from direct values - not yet implemented */
	inline void getPostprocessed (colUse use, int col, CString &result);

	/// returns input or output column of given index (starting with 0)
	const TColumn &getColumn (colUse use, int col) const;

	/// where to dump configuration (including columns) after processing data - if empty, nowhere
	CString dumpCfg;
	/// bool: go to through columns and set their translations or not?
	CInt examineColumns;

	/// finds minR, maxR, avg, stdev for all columns
	void examine ();

	/// sets the colIndexes array
	void setColIndexes ();

	int getSeriesCount(enumSetType set) const;

	/// deletes all rows
	void clear ()	{ data.DeleteAll(); pos = data.end(); }
	/// adds new row - set values by fillColumn functions
	void addRow (enumSetType set, int series = 0);
	/// preprocesses and sets value on current row
	inline void fillColumn (colUse use, int col, double value);
	/// preprocesses and sets value on current row
	inline void fillColumn (colUse use, int col, const CString &value);

	TRanges ranges;

	/** indexes of columns - e.g. columns' use is CU_NO,CU_INPUT,CU_OUTPUT,...
		=> colIndexes[CU_INPUT][0]=1, colIndexes[CU_OUTPUT][0]=2 */
	TVectorInt colIndexes[2];

	/// holds all the training data
	TPatterns data;
	/// guards the data when a neural network uses them
	Mutex dataMutex;
	/// row count for training and eval set
	CInt rowCount [2];
	/// columns count for input and output cols
	CInt colCount[2];
	TTrainingDataFiles files;
	/// columns description
	TColumns columns;
	/** Bool: Are data organised into series? */
	CInt series;
	/** If a row begins with this, it separates two series */
	CString seriesSeparator;

private:
	/** reads data from files - returns errors if any occur. The first param is the config file name. */
	CString readFromFiles (CString filename, bool onlyCategories = false);
	mutable TPatterns::iterator pos;

public:
	/* * * * * * * * * * * * * * * * * * * * * * * * * * * * */
	/*       Configuration READing and PRINTing              */

	/// dumps all data to the string
	void dumpData (CString &out);

	/** reads all data and sets columns - returns error description or empty string.
		First param is the config file name. */
	CString readData(CString filename);

	/// prints the cfg and the data as local in the file
	CRox *printAllLocal ();

	CRox *printColumns () const;
	CString readColumns (const CRox *xml);

STRUCT_END (CTrainingData)

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *                  C T r a i n i n g D a t a                    *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

inline bool CTrainingData::moveNext (CInt set) const
{
	if (pos == data.end()) return false;
	do ++pos;
	while (pos != data.end() && pos->set != set);
	return pos != data.end();
}

inline bool CTrainingData::movePrev (CInt set) const
{
	if (pos == data.begin()) return false;
	do --pos;
	while (pos != data.begin() && pos->set != set);
	return pos->set == set;
}

inline void CTrainingData::fillColumn (colUse use, int col, double value)
{
	TColumns::iterator icol = columns.begin()+colIndexes[use][col];
	icol->preprocess (value, icol->val( *pos ));
}

inline void CTrainingData::getPostprocessed (colUse use, int col, double &result)
{
	TColumns::iterator icol = columns.begin()+colIndexes[use][col];
	icol->postprocess (icol->val (*pos), result);
}

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *                        T F i l t e r		                 *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

inline void TFilter::preprocess (double value, double &result)
{
	switch ((int)translate) {
	case FTR_FLOAT: // transform into [toMinR;toMaxR]
		if (maxR == minR) result = 0;
		else result = (value - minR) * (toMaxR - toMinR) / (maxR - minR) + toMinR; break;
	case FTR_LINEAR:
		if (stdev == 0) result = 0;
		else result = (value - avg) * toStdev / stdev + toAvg; break;
	case FTR_NONE:
		result = value; break;
	case FTR_BOOLS:
		if ((int)type == FT_CATEGORY) {
			preprocess (toString (value,digits), result);
			break;
		}
	default: // not yet implemented
		assert (false);
		result = 0; break;
	}
}

inline void TFilter::preprocess (const CString &value, double &result)
{
	switch ((int)type) {
	case FT_INT:
	case FT_FLOAT:
		preprocess (atof (value.c_str()), result); break;
	case FT_CATEGORY: {
		int category = categories[value];
		switch ((int)translate) {
		case FTR_FLOAT:
		case FTR_LINEAR:
			preprocess (category, result);
			break;
		case FTR_BOOLS: {
			for (int i=0; i < categories.size(); ++i)
				(&result)[i] = i == category ? 1 : 0;
			break;	}
		default: assert (false); //not yet implemented
		}
		break;	  }
	default: assert (false); //not yet implemented
	}
}

inline void TFilter::postprocessError (const double &value, double &result) const
{
	switch ((int)translate) {
	case FTR_FLOAT:
		result = value * (maxR - minR) / (toMaxR - toMinR); break;
	case FTR_LINEAR:
		result = value * stdev / toStdev; break;
	case FTR_NONE:
		result = value; break;
	default: // not yet implemented
		assert (false);
		result = 0; break;
	}
}

inline void TFilter::postprocess (const double &value, double &result) const
{
	switch ((int)translate) {
	case FTR_FLOAT:
		result = (value - toMinR) * (maxR - minR) / (toMaxR - toMinR) + minR; break;
	case FTR_LINEAR:
		result = (value - toAvg) * stdev / toStdev + avg; break;
	case FTR_NONE:
		result = value; break;
	default: // not yet implemented
		assert (false);
		result = 0; break;
	}
}

inline void TFilter::postprocess (const double &value, CString &result) const
{
	switch ((int)translate) {
	case FTR_FLOAT:
	case FTR_LINEAR: {
		double dresult;
		postprocess (value, dresult);
		result = toString (dresult);
		break; }
	case FTR_BOOLS: {
		assert (type == FT_CATEGORY);
		int i;
		for (i=0; i < categories.size(); ++i)
			if ((&value)[i] == 1) {
				result = categories[i];
				return;
			}
		result = -1;
		break; }
	case FTR_NONE:
		result = toString (value);
		break;
	default: // not yet implemented
		assert (false);
		result = "";
		break;
	}
}

inline int TFilter::getWidth() const
{
	if (translate != FTR_BOOLS)
		return 1;
	else return categories.size();
}

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                               *
 *                         T C o lu m n		                 *
 *                                                               *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

inline const TColumn &CTrainingData::getColumn (colUse use, int icol) const
{
	assert (use == CU_INPUT || use == CU_OUTPUT);
	return columns[colIndexes[use][icol]];
}

inline double &TColumn::val (TPattern &pat, int shift) const
{
	assert (shift >= -window.left && shift <= window.right && use != CU_NO);
	return pat.data[use][pos + (window.left+shift) * TFilter::getWidth()];
}

inline double TColumn::val (const TPattern &pat, int shift) const
{
	assert (shift >= -window.left && shift <= window.right && use != CU_NO);
	return pat.data[use][pos + (window.left+shift) * TFilter::getWidth()];
}

inline void TColumn::copy (const TPattern &src, int shiftSrc, TPattern &dst, int shiftDst)
{
	assert (shiftSrc >= -window.left && shiftSrc <= window.right && use != CU_NO
		&& shiftDst >= -window.left && shiftDst <= window.right);
	if (use == CU_NO) return;
	for (int i=0; i < TFilter::getWidth(); ++i)
		dst.data[use][pos + (window.left+shiftDst) * TFilter::getWidth() + i]
		= src.data[use][pos + (window.left+shiftSrc) * TFilter::getWidth() + i];
}

inline int TColumn::getColWidth() const
{
	return (window.left + window.right + 1) * TFilter::getWidth();
}

#endif