1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <vdb/extern.h>
27
28 #include <vdb/xform.h>
29 #include <vdb/schema.h>
30 #include <klib/data-buffer.h>
31 #include <klib/rc.h>
32 #include <sysalloc.h>
33
34 #include <string.h>
35 #include <assert.h>
36
37
/* text_token
 *  one 16-bit field of a token description. the code in this file
 *  steps through the "tok" input three text_token values at a time.
 */
typedef uint16_t text_token;

/* indices of the fields within one three-value token entry;
   tt_start and tt_len are used in this file as an element offset
   into, and element count within, the tokenized text row */
enum { tt_id, tt_start, tt_len };
40
41
42 /* extract_token
43 * extract a textual token from an input string
44 *
45 * "idx" [ CONST ] - a zero-based index of the token
46 * if value < row_len ( tok ), then the substring of
47 * indexed token is returned. otherwise, returns empty.
48 *
49 * "str" [ DATA ] - input text. type must be compatible with
50 * output production, meaning types must be same, or ascii input
51 * with utf8 output.
52 *
53 * "tok" [ DATA ] - results of tokenizing "str"
54 */
55 static
extract_token(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])56 rc_t CC extract_token ( void *data, const VXformInfo *info, int64_t row_id,
57 VRowResult *rslt, uint32_t argc, const VRowData argv [] )
58 {
59 rc_t rc;
60 KDataBuffer *dst = rslt -> data;
61 uint32_t idx = ( uint32_t ) ( size_t ) data;
62
63 rslt -> elem_count = 0;
64 if ( ( uint64_t ) idx >= argv [ 1 ] . u . data . elem_count )
65 {
66 /* issue empty string */
67 rc = KDataBufferResize ( dst, 0 );
68 }
69 else
70 {
71 size_t sub_bytes, elem_bytes = ( size_t ) ( argv [ 0 ] . u . data . elem_bits >> 3 );
72 const char *str = argv [ 0 ] . u . data . base;
73 const text_token *tok = argv [ 1 ] . u . data . base;
74
75 str += argv [ 0 ] . u . data . first_elem * elem_bytes;
76 tok += ( argv [ 1 ] . u . data . first_elem + idx ) * 3;
77 sub_bytes = elem_bytes * tok [ tt_len ];
78
79 /* set output buffer size */
80 KDataBufferCast ( dst, dst, rslt -> elem_bits, true );
81 rc = KDataBufferResize ( dst, tok [ tt_len ] );
82 if ( rc == 0 )
83 {
84 /* copy substring */
85 rslt -> elem_count = tok [ tt_len ];
86 memmove ( dst -> base, & str [ elem_bytes * tok [ tt_start ] ], sub_bytes );
87 }
88 }
89
90 return rc;
91 }
92
93 VTRANSFACT_IMPL ( vdb_extract_token, 1, 0, 0 ) ( const void *self, const VXfactInfo *info,
94 VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
95 {
96 /* ensure that the type of input is either identical to
97 or compatible with the output */
98 if ( ! VTypedeclToTypedecl ( & dp -> argv [ 0 ] . fd . td, info -> schema, & info -> fdesc . fd . td, NULL, NULL ) )
99 return RC ( rcXF, rcFunction, rcConstructing, rcType, rcInconsistent );
100
101 rslt -> self = ( void* ) ( size_t ) cp -> argv [ 0 ] . data . u32 [ 0 ];
102 rslt -> variant = vftRow;
103 rslt -> u . rf = extract_token;
104 return 0;
105 }
106