1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 #include <vdb/extern.h>
27 
28 #include <vdb/xform.h>
29 #include <vdb/schema.h>
30 #include <klib/data-buffer.h>
31 #include <klib/rc.h>
32 #include <sysalloc.h>
33 
34 #include <string.h>
35 #include <assert.h>
36 
37 
/* text_token
 *  a single 16-bit component of a token description
 */
typedef uint16_t text_token;

/* positions of the components within one token description,
   which extract_token reads as three consecutive text_token values */
enum
{
    tt_id,
    tt_start,
    tt_len
};
40 
41 
42 /* extract_token
43  *  extract a textual token from an input string
44  *
45  *  "idx" [ CONST ] - a zero-based index of the token
46  *  if value < row_len ( tok ), then the substring of
47  *  indexed token is returned. otherwise, returns empty.
48  *
49  *  "str" [ DATA ] - input text. type must be compatible with
50  *  output production, meaning types must be same, or ascii input
51  *  with utf8 output.
52  *
53  *  "tok" [ DATA ] - results of tokenizing "str"
54  */
55 static
extract_token(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])56 rc_t CC extract_token ( void *data, const VXformInfo *info, int64_t row_id,
57     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
58 {
59     rc_t rc;
60     KDataBuffer *dst = rslt -> data;
61     uint32_t idx = ( uint32_t ) ( size_t ) data;
62 
63     rslt -> elem_count = 0;
64     if ( ( uint64_t ) idx >= argv [ 1 ] . u . data . elem_count )
65     {
66         /* issue empty string */
67         rc = KDataBufferResize ( dst, 0 );
68     }
69     else
70     {
71         size_t sub_bytes, elem_bytes = ( size_t ) ( argv [ 0 ] . u . data . elem_bits >> 3 );
72         const char *str = argv [ 0 ] . u . data . base;
73         const text_token *tok = argv [ 1 ] . u . data . base;
74 
75         str += argv [ 0 ] . u . data . first_elem * elem_bytes;
76         tok += ( argv [ 1 ] . u . data . first_elem + idx ) * 3;
77         sub_bytes = elem_bytes * tok [ tt_len ];
78 
79         /* set output buffer size */
80         KDataBufferCast ( dst, dst, rslt -> elem_bits, true );
81         rc = KDataBufferResize ( dst, tok [ tt_len ] );
82         if ( rc == 0 )
83         {
84             /* copy substring */
85             rslt -> elem_count = tok [ tt_len ];
86             memmove ( dst -> base, & str [ elem_bytes * tok [ tt_start ] ], sub_bytes );
87         }
88     }
89 
90     return rc;
91 }
92 
93 VTRANSFACT_IMPL ( vdb_extract_token, 1, 0, 0 ) ( const void *self, const VXfactInfo *info,
94     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
95 {
96     /* ensure that the type of input is either identical to
97        or compatible with the output */
98     if ( ! VTypedeclToTypedecl ( & dp -> argv [ 0 ] . fd . td, info -> schema, & info -> fdesc . fd . td, NULL, NULL ) )
99         return RC ( rcXF, rcFunction, rcConstructing, rcType, rcInconsistent );
100 
101     rslt -> self = ( void* ) ( size_t ) cp -> argv [ 0 ] . data . u32 [ 0 ];
102     rslt -> variant = vftRow;
103     rslt -> u . rf = extract_token;
104     return 0;
105 }
106