1 /***************************************************************
2  Copyright (C) 2006-2014 Hewlett-Packard Development Company, L.P.
3  Copyright (C) 2014, Siemens AG
4 
5  This program is free software; you can redistribute it and/or
6  modify it under the terms of the GNU General Public License
7  version 2 as published by the Free Software Foundation.
8 
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  GNU General Public License for more details.
13 
14  You should have received a copy of the GNU General Public License along
15  with this program; if not, write to the Free Software Foundation, Inc.,
16  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
17 
18  ***************************************************************/
19 
20 /**
21  * \dir
22  * \brief Nomos agent source
23  * \file
24  * \brief Nomos header file
25  * \page nomos Nomos agent
26  * \tableofcontents
27  * \section nomosabout About
28  * Nomos does license identification using short phrases (regular expressions)
29  * and heuristics (e.g. phrase must be found in (or out of) proximity to
30  * another phrase or phrases).
31  *
32  * Nomos may also identify a "style" type of license if it has similarities
33  * with a known license type.
34  *
35  * The signatures which uniquely identify a license are stored in `STRINGS.in`
36  * file.
37  *
38  * If you wish to contribute to Nomos, please read [How to add a new license
39  * signature](https://github.com/fossology/fossology/wiki/Nomos).
40  *
41  * \subsection nomosdebug Debugging
 * Activate the defines `PROC_TRACE` and/or `DOCTOR_DEBUG` (see line 31 of
 * file parse.c) to make Nomos generate a lot of tracing information that is
 * really helpful for debugging.
45  *
46  * `PROC_TRACE` will show you, for example, which regex's were tried and which
47  * are successful. To see the successful matches, grep the output file for
48  * "addRef".
49  *
 * `DOCTOR_DEBUG` will show you the before and after versions of the buffer to
 * be processed. Look in the output file for `----- [Dr-BEFORE:] -----` and
 * `+++++ [Dr-AFTER] +++:`
53  *
54  * \section nomosactions Supported actions
55  *
56  * | Command line flag | Description |
57  * | ---: | :--- |
58  * | -i   | Initialize the database, then exit. |
59  * | -c   | Specify the directory for the system configuration. |
60  * | -l   | Print full file path (command line only). |
61  * | -v   | Verbose (-vv = more verbose). |
62  * | -J   | Output in JSON. |
 * | -S   | Print highlight info to stdout. |
64  * | file | If files are listed, print the licenses detected within them. |
65  * | no file | Process data from the scheduler. |
66  * | -V   | Print the version info, then exit. |
67  * | -d   | Specify a directory to scan. |
 * | -n   | Spawn n - 1 child processes to run, so there will be n running
 * processes (the parent and n - 1 children). \n The default n is 2 (when n is
 * less than 2 or not set, it is changed to 2) when -d is specified. |
71  * \section nomossource Agent source
72  *   - \link src/nomos/agent \endlink
73  *   - \link src/nomos/ui \endlink
74  *   - Functional test cases \link src/nomos/agent_tests/Functional \endlink
75  *   - Unit test cases \link src/nomos/agent_tests/Unit \endlink
76  */
77 
78 #ifndef _NOMOS_H
79 #define _NOMOS_H 1
80 #ifndef	_GNU_SOURCE
81 #define	_GNU_SOURCE
82 #endif	/* not defined _GNU_SOURCE */
83 #include <glib.h>
84 #include <stdio.h>
85 #include <assert.h>
86 #include <stdlib.h>
87 #include <search.h>
88 #include <unistd.h>
89 #include <fcntl.h>
90 #include <string.h>
91 #include <strings.h>
92 #include <ctype.h>
93 #include <dirent.h>
94 #include <ftw.h>
95 #include <regex.h>
96 #include <getopt.h>
97 #include <time.h>
98 #include <libgen.h>
99 #include <errno.h>
100 #include <sys/types.h>
101 #include <sys/stat.h>
102 #include <sys/wait.h>
103 #include <sys/mman.h>
104 #include <sys/time.h>
105 #include "nomos_gap.h"
106 #include <stdbool.h>
107 #include <semaphore.h>
108 #include <stdbool.h>
109 #include "json_writer.h"
110 
111 /** Use nomos in standalone mode (no FOSSology DB) */
112 #ifdef STANDALONE
113 #include "standalone.h"
114 #else
115 #include <libfossology.h>
116 #include <libfossdbmanager.h>
117 #endif
118 
119 /**
120  * To use our local version of debug-malloc(), compile -DMEMORY_TRACING
121  */
122 #ifdef	MEMORY_TRACING
123 #include "DMalloc.h"
124 #endif	/* MEMORY_TRACING */
125 
/*
 * Compile-time feature switches. The first three are defined
 * unconditionally by this header; FLAG_NO_COPYRIGHT is left disabled.
 */
#define PRECHECK
#define GPLV2_BEATS_GPLV3
#define SAVE_UNCLASSIFIED_LICENSES
/*#define	FLAG_NO_COPYRIGHT*/

/* Building with PROC_TRACE_SWITCH also turns on PROC_TRACE. */
#if defined(PROC_TRACE_SWITCH)
#define PROC_TRACE
#endif /* PROC_TRACE_SWITCH */
134 
#define myBUFSIZ      4096  ///< Buffer max length
#define MAX_RENAME    1000  ///< Max rename length
#define TEMP_FILE_LEN 100   ///< Max temp file length

/**
 * MAX_SCANBYTES is the maximum number of bytes that will be scanned
 * in a file (1024 * 1024 bytes). Historically, we have never found a
 * license more than 64k into a file.
 *
 * The expansion is parenthesized so that the macro behaves as one value
 * inside a larger expression such as `x / MAX_SCANBYTES`.
 */
#define MAX_SCANBYTES (1024 * 1024)
144 
/**
 * Program options and flags.
 *
 * Every OPTS_* value is a distinct single bit, so several of them can be
 * OR-ed together into one integer.
 *
 * MD: I think these are used when making nomos
 */
#define OPTS_DEBUG              0x01
#define OPTS_TRACE_SWITCH       0x02
#define OPTS_LONG_CMD_OUTPUT    0x04
#define OPTS_HIGHLIGHT_STDOUT   0x08
#define OPTS_NO_HIGHLIGHTINFO   0x10
#define OPTS_JSON_OUTPUT        0x20
#define OPTS_SCANNING_DIRECTORY 0x40
157 
158 extern char debugStr[myBUFSIZ];        ///< Debug string
159 extern char dbErrString[myBUFSIZ];     ///< DB error string
160 extern char saveLics[myBUFSIZ];        ///< License string
161 
162 extern size_t hashEntries;             ///< Hash entries
163 
/**
 * Flags for program control.
 *
 * Each FL_* value is a distinct single bit.
 */
#define FL_SAVEBASE    0x020
#define FL_FRAGMENT    0x040
#define FL_SHOWMATCH   0x080
#define FL_NOCOPYRIGHT 0x100
171 
/**
 * Names of various files/dirs created/used
 */
#define FILE_FOUND  "Found.txt"
#define FILE_SCORES "_scores"
#define DEBUGLOG    "/tmp/NomosDebugLog"
178 
179 
/** Symbolic Boolean value for NO */
#define NO  0
/** Symbolic Boolean value for YES */
#define YES 1
188 
/* List-sorting flags (consecutive values starting at 0 for UNSORTED) */
#define UNSORTED           0
#define SORT_BY_NAME       1
#define SORT_BY_NAME_ICASE 2
#define SORT_BY_COUNT_DSC  3
#define SORT_BY_COUNT_ASC  4
#define SORT_BY_ALIAS      5
#define SORT_BY_BASENAME   6
197 
/* Interest level (licenses), from highest to the -1 initial value */
#define IL_HIGH 3
#define IL_MED  2
#define IL_LOW  1
#define IL_NONE 0
#define IL_INIT (-1)
204 
/**
 * license-text search results (ltsr) stuff
 *
 * Bit 0 records "matched" and bit 1 records "searched for"; LTSR_YES has
 * both bits set and LTSR_NO only the "searched" bit.
 */
#define LTSR_RMASK ((char) 1)                         /**< True if it's been matched */
#define LTSR_SMASK ((char) 2)                         /**< True if it's been searched for */
#define LTSR_YES   ((char) (LTSR_RMASK | LTSR_SMASK)) /**< Both searched, and matched */
#define LTSR_NO    LTSR_SMASK                         /**< Searched but not matched */
212 
/**
 * Miscellaneous strings used in various modules
 */
#define STR_NOTPKG "None (not an rpm-format package)"

/*
 * License-scanning limits
 */
#define _scCOMFORT 9 /**< >= 9 --> certain it's a license */
#define _scINVALID 4 /**< < 4 --> probably NOT a license */
223 
/**
 * LS_ = License Summaries/Strings
 */
#define LS_NONE     "None"
#define LS_UNLIKELY "LikelyNot"
#define LS_NOSUM    "No_license_found"
#define LS_UNCL     "UnclassifiedLicense"
#define LS_NOT_PD   "NOT-public-domain"
#define LS_PD_CLM   "Public-domain"
#define LS_PD_CPRT  "Public-domain(C)"
#define LS_PD_ONLY  "Public-domain-ref"
#define LS_CPRTONLY "Misc-Copyright"
#define LS_TDMKONLY "Trademark-ref"
#define LS_LICRONLY "License-ref"
#define LS_PATRONLY "Patent-ref"
239 
/*
 * NULL values
 *
 * Each pointer constant is a typed cast of NULL. The expansions are fully
 * parenthesized so they stay a single operand next to operators such as
 * sizeof.
 */
#define NULL_ITEM ((item_t *) NULL) ///< NULL item
#define NULL_LIST ((list_t *) NULL) ///< NULL list
#define NULL_FH   ((fh_t *) NULL)   ///< NULL fh
#define NULL_CHAR '\0'              ///< NULL character
#define NULL_STR  ((char *) NULL)   ///< NULL string
248 
/**
 * Macros needed across >1 source module
 */

/**
 * True when the character value x is an end-of-line character
 * ('\n', '\r' or '\v').
 *
 * x is parenthesized in the expansion so that compound arguments (e.g. a
 * conditional expression) compare correctly. x may be evaluated up to three
 * times, so do not pass an argument with side effects.
 */
#define isEOL(x)   ((((x) == '\n') || ((x) == '\r') || ((x) == '\v')))

/**
 * True when x is greater than or equal to gl.blkUpperLimit.
 *
 * NOTE(review): struct globals as declared in this header has no
 * blkUpperLimit member, so any use of this macro would not compile as-is -
 * confirm whether the macro is still needed.
 */
#define IS_HUGE(x) ((x) >= gl.blkUpperLimit)
254 
255 
256 
/** Path of the nomos temporary directory */
#define NOMOS_TEMP  "/tmp/nomos.tempdir"
/** Lock-file path inside the temporary directory. CDB, This goes away. */
#define NOMOS_TLOCK "/tmp/nomos.tempdir/.lock.tmp,"
259 
260 
261 /**
262  * Caches memory-mapped files
263  */
264 struct mm_cache {
265     int inUse;            ///< Cache in use
266     int fd;               ///< File descriptor
267     unsigned long size;   ///< Size
268     void *mmPtr;          ///< Memory pointer
269     char label[myBUFSIZ]; ///< Label
270 };
271 
272 
/*
  CDB - This is kind of tricky, the way it uses the same fields for
  different meanings. If we had objects, we could subclass. It works
  okay, but is just a PITA for debugging.
 */

/**
   listitem item_t
   \brief tricky data structure used for a list of 'items'

   Meanings of val fields are dependent on the particular list --
   See #defines below for examples.
 */
typedef struct listitem {
  int val;    /**< first list-dependent value */
  int val2;   /**< second list-dependent value */
  int val3;   /**< third list-dependent value */
  char *str;  /**< primary key for list-element */
  void *buf;  /**< alias, extra data, whatever */
} item_t;
294 
295 
/**
   Defines for the list val fields

   Each name below is an alias for one member of a list item, grouped here
   by the member it maps to.
 */
/* aliases of val */
#define seqNo       val
#define foundTool   val
#define refCount    val
#define num         val
#define bStart      val
#define iFlag       val
/* aliases of val2 */
#define ssComp      val2
#define isProcessed val2
#define iLevel      val2
#define nMatch      val2
#define bLen        val2
/* aliases of val3 */
#define bDocLen     val3
#define bIndex      val3
/* alias of buf */
#define bList       buf
313 
314 /**
315  list
316  \brief list_t type structure used to keep various lists. (e.g. there are
317  multiple lists).
318 
319  */
320 struct list {
321     char name[64]; /**< Name of the list */
322     int used; /**< Number of items found, 0 is empty list */
323     int size; /**< What size is this? (MD) */
324     int ix; /**< The index for the items below */
325     int sorted; /**< Flag to indicate how ?? (the list or the items in the
326                          list?) things are sorted: SORT_BY_NAME or
327                          SORT_BY_NAME_ICASE */
328     int desc; /**< Description */
329     item_t *items; /**< List items */
330 };
331 typedef	struct list list_t;
332 
/**
 * Search string
 */
typedef struct searchString {
  int csLen;     /**< String length */
  char *csData;  /**< String data */
} searchString_t;
341 
342 /**
343  * License specification
344  */
345 struct licenseSpec {
346     searchString_t seed;  ///< License seed
347     searchString_t text;  ///< License text
348 };
349 typedef struct licenseSpec licSpec_t;
350 
351 
352 /**
353   \brief Structure holding data truly global in that it remains consistent
354   for each file scanned.
355  */
356 struct globals {
357     char initwd[myBUFSIZ];  ///< CDB, would like to workaround/eliminate.
358     char progName[64];      ///< Program name
359     int progOpts;           ///< CLI options
360     int flags;              ///< Flags
361     int uPsize;             ///< Size
362 #ifdef	GLOBAL_DEBUG
363   int DEEBUG;
364   int MEM_DEEBUG;
365 #endif	/* GLOBAL_DEBUG */
366 #ifdef	PROC_TRACE_SWITCH
367   int ptswitch;
368 #endif	/* PROC_TRACE_SWITCH */
369     list_t sHash;           ///< Hashes
370     /* Agent-specific Things */
371     int agentPk;            ///< Agent id
372     long uploadFk;          ///< Upload id
373     int arsPk;              ///< Agent ars id
374     PGconn *pgConn;         ///< DB Connection
375     fo_dbManager *dbManager;  ///< FOSSology DB manager
376 };
377 
/**
 * License match positions and license type
 */
typedef struct {
  int start;  /**< Start position of match */
  int end;    /**< End position of match */
  int index;  /**< Enums from index (Entrynumber) in STRINGS.in */
} MatchPositionAndType;
386 
387 /**
388  * License matches
389  */
390 typedef struct  {
391     GArray* matchPositions;   ///< Match positions
392     GArray* indexList;        ///< License indexes
393     char* licenceName;        ///< License names
394     int licenseFileId;        ///< PFile id
395 } LicenceAndMatchPositions;
396 
397 
398 
399 /**
400   curScan
401   \brief Struct that tracks state related to current file being scanned.
402  */
403 struct curScan {
404   char cwd[myBUFSIZ];      /**< CDB, Would like to workaround and eliminate. */
405   char targetDir[myBUFSIZ]; 	/**< Directory where file is */ /* check */
406   char targetFile[myBUFSIZ]; 	/**< File we're scanning (tmp file)*/ /* check */
407   char filePath[myBUFSIZ];    /**< the original file path passed in */
408   long pFileFk;            /**< [in] pfile_fk from scheduler */
409   char pFile[myBUFSIZ];       /**< [in] pfilename from scheduler */
410   char *licPara;
411   char *matchBase;
412   size_t targetLen;
413   size_t cwdLen;
414   struct stat stbuf;
415   regmatch_t regm;
416   list_t regfList;
417   list_t fLicFoundMap;
418   list_t parseList;
419   list_t offList;
420   list_t lList;
421   char compLic[myBUFSIZ];  	/**< the license(s) found, None or NotLikely.
422                                 comma separated if multiple names are found. */
423   int nLines;
424   int cliMode;                /**< boolean to indicate running from command line */
425   char *tmpLics;              /**< pointer to storage for parsed names */
426   char *licenseList[512];     /**< list of license names found, can be a single name */
427 
428   GArray* indexList; /**< List of license indexes */
429   GArray* theMatches; /**< List of matches */
430   GArray* keywordPositions; /**< List of matche positions */
431   GArray* docBufferPositionsAndOffsets;
432   int currentLicenceIndex;
433 };
434 
/**
 * License pattern
 */
struct license {
  int len;     /**< Length of pattern */
  char *patt;  /**< License pattern to use */
};
442 
/**
 * License text to information
 */
typedef struct licensetext {
  char *regex;   /**< License regex */
  char *tseed;   /**< unencrypted license text */
  int nAbove;
  int nBelow;
  int compiled;
  int plain;
} licText_t;
455 
/** Get regex of the license text at index x of the licText table */
#define _REGEX(x) licText[(x)].regex
/** Get seed of the license text at index x of the licText table */
#define _SEED(x)  licText[(x)].tseed
464 
465 /**
466  * License scan result
467  */
468 struct scanResults {
469   int score;        ///< License match score
470   int kwbm;
471   int size;
472   int flag;         ///< Flags
473   int dataOffset;
474   char fullpath[myBUFSIZ];
475   char linkname[16];
476   char *licenses;
477   char *relpath;
478   size_t nameOffset;
479 };
480 typedef	struct scanResults scanres_t;
481 
/**
 * List-based memory tags
 */
#define MTAG_UNSORTKEY "list/str (initially-UNsorted key)"
#define MTAG_SORTKEY   "list/str (initially-sorted key)"
#define MTAG_LISTKEY   "list/str (sorted/unsorted key)"
#define MTAG_REPLKEY   "list/str (replaced primary key)"
#define MTAG_LISTBUF   "list/buf (any data)"
#define MTAG_PATHBASE  "list/buf (path basename)"
#define MTAG_PKGINFO   "list/buf (pkg rname/type/name/vers/lic)"
#define MTAG_PKG_NV    "list/buf (pkg name/vers)"
#define MTAG_MD5SUM    "list/buf (distro-arch MD5SUM)"
#define MTAG_COUNTER   "list/buf integer (counter)"
#define MTAG_PKGNAME   "list/buf (package-name)"
#define MTAG_PKGVERS   "list/buf (package-vers)"
#define MTAG_CLAIMLIC  "list/buf (claimlic copy)"
#define MTAG_COMPLIC   "list/buf (pkg compLic copy)"
#define MTAG_URLCOPY   "list/buf (pkg URL copy)"
#define MTAG_FILELIC   "list/buf (file-license copy)"
#define MTAG_FIXNAME   "list/buf (fixed-package name)"
/**
 * Miscellaneous memory tags
 */
#define MTAG_SEEDTEXT  "search-seed text"
#define MTAG_SRCHTEXT  "license-search text"
#define MTAG_MMAPFILE  "mmap-file data"
#define MTAG_MAGICDATA "file magic description"
#define MTAG_PATTRS    "pkg-attr buffer"
#define MTAG_DOUBLED   "doubled (reallocated) data"
#define MTAG_SEARCHBUF "initial search-data buffer"
#define MTAG_TOOSMALL  "too-small half-size buffer"
#define MTAG_TEXTPARA  "paragraph text"
#define MTAG_LIST      "dynamically-allocated list"
#define MTAG_ENV       "environment variable"
#define MTAG_SCANRES   "scan-results list"
517 
518 
/*
   Functions defined in nomos.c, used in other files
 */

/* NOTE(review): presumably ends the program using exitval - confirm in nomos.c */
extern void Bail(int exitval);

/* NOTE(review): presumably reports whether option bit val is set - confirm in nomos.c */
extern int optionIsSet(int val);
524 
525 /*
526   Global Declarations
527  */
528 extern struct globals gl;
529 extern struct curScan cur;
530 extern licText_t licText[];
531 extern licSpec_t licSpec[];
532 extern int schedulerMode; /* Non-zero if being run by scheduler */
533 
/**
  Declarations for using the memory debug stuff

  memAlloc(x, y) and memFree(x, y) take a size or pointer x and a tag y.
  With MEMORY_TRACING they forward both arguments to the tagged allocator;
  otherwise the tag is dropped, memAlloc() returns x zero-filled bytes from
  calloc() and memFree() simply calls free().

  NOTE(review): memAllocTagged()/memFreeTagged() are declared without
  parameter lists; their exact parameter types are not visible here.
 */
#ifdef MEMORY_TRACING
char *memAllocTagged();
void memFreeTagged();
#define memFree(x, y)             memFreeTagged(x, y)
#define memAlloc(x, y)            memAllocTagged(x, y)
#else /* NOT MEMORY_TRACING */
#define memFree(x, /*notused*/ y) free(x)
#define memAlloc(x, y)            calloc(x, 1)
#endif /* NOT MEMORY_TRACING */
546 
/*
 * Macros for timing - refer to findPhrase() for usage examples
 *
 * All macros operate on the three locals introduced by DECL_TIMER:
 *   bTV      - begin timestamp
 *   eTV      - end timestamp
 *   proctime - elapsed seconds between them, computed by END_TIMER
 *
 * The statement-like macros are wrapped in do { } while (0) so that each one
 * is a single statement and is safe inside an unbraced if/else. Invoke them
 * with a trailing semicolon.
 */
/* need DECL_TIMER in the declarations section of function */
#define	DECL_TIMER	struct timeval bTV, eTV; float proctime
/* Copy the end timestamp over the begin timestamp */
#define	ZERO_TIMER	((void) (bTV = eTV))
/* Store the current time of day in x; the gettimeofday() result is ignored */
#define	RECORD_TIMER(x)	((void) gettimeofday(&(x), NULL))
/* Record the begin timestamp */
#define	START_TIMER	RECORD_TIMER(bTV)
/* Record the end timestamp and compute proctime */
#define	END_TIMER	do { \
    RECORD_TIMER(eTV); \
    proctime = (float) (eTV.tv_sec - bTV.tv_sec) + \
    ((float) (eTV.tv_usec - bTV.tv_usec) * 0.000001); \
  } while (0)
/* Finish the current measurement, then start the next one at its end time */
#define	RESET_TIMER	do { END_TIMER; ZERO_TIMER; } while (0)
/* Print both raw timestamps; values are cast to long to match %ld */
#define	DUMP_TIMERS	do { \
    printf("[1]: %ld.%06ld\n", (long) bTV.tv_sec, (long) bTV.tv_usec); \
    printf("[2]: %ld.%06ld\n", (long) eTV.tv_sec, (long) eTV.tv_usec); \
  } while (0)
/* Print proctime labelled with string x; also dump the timestamps if y is non-zero */
#define	PRINT_TIMER(x,y)	do { \
    printf("%11.6f seconds: %s\n", proctime, (x)); \
    if (y) { DUMP_TIMERS; } \
  } while (0)
563 
564 
565 /*
566  * Cut-and-paste this stuff to turn on timing
567  */
568 #if	0
569 #ifdef	TIMING
570 DECL_TIMER;	/* timer declaration */
571 #endif
572 /* */
573 #ifdef	TIMING
574 START_TIMER;	/* turn on the timer */
575 #endif	/* TIMING */
576 /* */
577 #ifdef	TIMING
578 END_TIMER;	/* stop the timer */
579 PRINT_TIMER("unpack", 0);	/* ... and report */
580 START_TIMER;	/* optionally re-start timer */
581 #endif	/* TIMING */
582 #endif
583 
584 #endif /* _NOMOS_H */
585