1 #ifndef FUZZY_H
2 #define FUZZY_H
3 
4 /*
5  * Copyright (C) ManTech International Corporation 2010
6  * Copyright (C) Kyrus 2012
7  * Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
8  *
9  * $Id$
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
24  *
25  * Earlier versions of this code can be found at:
26  *     http://ssdeep.sf.net/
27  */
28 
29 /// \mainpage
30 /// This is the documentation for the fuzzy hashing API from ssdeep.
31 ///
32 /// There is a complete function reference in fuzzy.h.
33 ///
34 /// The most recent version of this documentation can be found
35 /// at http://ssdeep.sourceforge.net/.
36 ///
37 /// \copydoc fuzzy.h
38 ///
39 /// \version 3.0
40 ///
41 /// \author Jesse Kornblum, research@jessekornblum.com
42 /// \author Helmut Grohne, helmut@subdivi.de
43 
44 /// \file fuzzy.h
45 /// \brief
46 /// These functions allow a programmer to compute the fuzzy hashes
47 /// (also called the context-triggered piecewise hashes) of
48 /// @link fuzzy_hash_buf() a buffer
49 /// of text @endlink,
50 /// @link fuzzy_hash_filename() the contents of a file on the disk @endlink,
51 /// and
52 /// @link fuzzy_hash_file() the contents of
53 /// an open file handle @endlink.
54 /// There is also a function to
55 /// @link fuzzy_compare() compute the
56 /// similarity between any two fuzzy signatures @endlink.
57 
58 
59 #include <stdint.h>
60 #include <stdio.h>
61 
62 #ifdef __cplusplus
63 extern "C" {
64 #endif
65 
66 /**
67  * @brief Flag for fuzzy_digest: eliminate sequences of more than three
68  *        identical characters. May be OR-ed with other FUZZY_FLAG_* values.
69  */
70 #define FUZZY_FLAG_ELIMSEQ 0x1u
71 /**
72  * @brief Flag for fuzzy_digest: do not truncate the second part to
73  *        SPAMSUM_LENGTH/2 characters. May be OR-ed with other FUZZY_FLAG_*.
74  */
75 #define FUZZY_FLAG_NOTRUNC 0x2u
76 
77 struct fuzzy_state; /* only forward-declared in this header; obtained via fuzzy_new or fuzzy_clone */
78 
79 /**
80  * @brief Construct a new fuzzy_state object and return it.
81  *
82  * Feed data to it with fuzzy_update and obtain the hash with fuzzy_digest.
83  * The returned object must be disposed with fuzzy_free.
84  * @return the constructed fuzzy_state or NULL on failure
85  */
86 extern /*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_new(void);
87 
88 /**
89  * @brief Create a copy of a fuzzy_state object and return it.
90  *
91  * The copy can be used with fuzzy_update and fuzzy_digest independently of
92  * the original. The copy must be disposed with fuzzy_free, just as the
93  * original must be.
94  * @param state The fuzzy state to copy
95  * @return the cloned fuzzy_state or NULL on failure
96  */
97 extern /*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_clone(const struct fuzzy_state *state);
98 
99 /**
100  * @brief Set the fixed total length of the input.
101  *
102  * If the total size of the data to be hashed is known in advance, the
103  * computation can be sped up by restricting the range of block sizes.
104  * @param state The fuzzy state
105  * @param total_fixed_length Total length of the data to generate digest
106  * @return 0 on success or -1 on failure
107  */
108 extern int fuzzy_set_total_input_length(struct fuzzy_state *state, uint_least64_t total_fixed_length);
109 
110 /**
111  * @brief Feed the data contained in the given buffer to the state.
112  *
113  * When an error occurs, the state is undefined. In that case it must not be
114  * passed to any function besides fuzzy_free.
115  * @param state The fuzzy state
116  * @param buffer The data to be hashed
117  * @param buffer_size The length of the given buffer in bytes
118  * @return zero on success, non-zero on error
119  */
120 extern int fuzzy_update(struct fuzzy_state *state,
121 			const unsigned char *buffer,
122 			size_t buffer_size);
123 
124 /**
125  * @brief Obtain the fuzzy hash from the state.
126  *
127  * This operation does not change the state at all. It reports the hash for the
128  * concatenation of the data previously fed using fuzzy_update.
129  * @param state The fuzzy state
130  * @param result Where the fuzzy hash is stored. This buffer
131  * must be able to hold at least FUZZY_MAX_RESULT bytes.
132  * @param flags A bitwise OR of FUZZY_FLAG_* macros. The absence of flags is
133  * represented by a zero.
134  * @return zero on success, non-zero on error
135  */
136 extern int fuzzy_digest(const struct fuzzy_state *state,
137 			/*@out@*/ char *result,
138 			unsigned int flags);
139 
140 /**
141  * @brief Dispose of a fuzzy state obtained from fuzzy_new or fuzzy_clone.
142  * @param state The fuzzy state to dispose
143  */
144 extern void fuzzy_free(/*@only@*/ struct fuzzy_state *state);
145 
146 /**
147  * @brief Compute the fuzzy hash of a buffer
148  *
149  * This computes the fuzzy hash of the first buf_len bytes of the buffer.
150  * It is the caller's responsibility to append the filename,
151  * if any, to result after computation.
152  * @param buf The data to be fuzzy hashed
153  * @param buf_len The length of the data being hashed, in bytes
154  * @param result Where the fuzzy hash of buf is stored. This buffer
155  * must be able to hold at least FUZZY_MAX_RESULT bytes.
156  * @return Returns zero on success, non-zero on error.
157  */
158 extern int fuzzy_hash_buf(const unsigned char *buf,
159 			  uint32_t buf_len,
160 			  /*@out@*/ char *result);
161 
162 /**
163  * @brief Compute the fuzzy hash of a file using an open handle
164  *
165  * Computes the fuzzy hash of the contents of the open file, starting
166  * at the beginning of the file. When finished, the file pointer is
167  * returned to its original position. If an error occurs, the file
168  * pointer's value is undefined.
169  * It is the caller's responsibility to append the filename
170  * to the result after computation.
171  * @param handle Open handle to the file to be hashed
172  * @param result Where the fuzzy hash of the file is stored. This
173  * buffer must be able to hold at least FUZZY_MAX_RESULT bytes.
174  * @return Returns zero on success, non-zero on error
175  */
176 extern int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result);
177 
178 /**
179  * @brief Compute the fuzzy hash of a stream using an open handle
180  *
181  * Computes the fuzzy hash of the contents of the open stream, starting at the
182  * current file position until reaching EOF. Unlike fuzzy_hash_file, no seek is
183  * ever performed on the stream. If an error occurs, the result as well as the
184  * file position are undefined.
185  * It is the caller's responsibility to append the filename
186  * to the result after computation.
187  * @param handle Open handle to the stream to be hashed
188  * @param result Where the fuzzy hash of the stream is stored. This
189  * buffer must be able to hold at least FUZZY_MAX_RESULT bytes.
190  * @return Returns zero on success, non-zero on error
191  */
192 extern int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result);
193 
194 /**
195  * @brief Compute the fuzzy hash of a file
196  *
197  * Opens, reads, and hashes the contents of the file 'filename'.
198  * The result buffer must be able to hold FUZZY_MAX_RESULT bytes.
199  * It is the caller's responsibility to append the filename
200  * to the result after computation.
201  * @param filename The file to be hashed
202  * @param result Where the fuzzy hash of the file is stored. This
203  * buffer must be able to hold at least FUZZY_MAX_RESULT bytes.
204  * @return Returns zero on success, non-zero on error.
205  */
206 extern int fuzzy_hash_filename(const char *filename, /*@out@*/ char * result);
207 
208 /// @brief Computes the match score between two fuzzy hash signatures.
209 /// @param sig1 The first fuzzy hash signature to compare.
210 /// @param sig2 The second fuzzy hash signature to compare.
211 /// @return Returns a value from zero to 100 indicating the match score of the
212 /// two signatures. A match score of zero indicates the signatures did not match.
213 /// When an error occurs, such as if one of the inputs is NULL, returns -1.
214 extern int fuzzy_compare(const char *sig1, const char *sig2);
215 
216 /** Length, in characters, of an individual fuzzy hash signature component. */
217 #define SPAMSUM_LENGTH 64
218 
219 /** The longest possible length for a fuzzy hash signature (without the
220  * filename); result buffers must be able to hold at least this many bytes. */
221 #define FUZZY_MAX_RESULT (2 * SPAMSUM_LENGTH + 20)
222 
223 #ifdef __cplusplus
224 }
225 #endif
226 
227 #endif
228