1 #pragma once
2 
3 #ifdef SIMDJSON_COMPETITION_RAPIDJSON
4 
5 #include "partial_tweets.h"
6 #include <string.h>
7 #include <fstream>
8 
9 namespace partial_tweets {
10 
11 using namespace rapidjson;
12 
13 struct rapidjson_sax {
14     using StringType=std::string_view;
15 
16     // 8 keys to parse for each tweet (in order of appearance): "created_at", "id", "text", "in_reply_status_id", "id"(user),
17     // "screen_name"(user), "retweet_count" and "favorite_count".
18     // Assume that the first valid key encountered will be the correct key to parse.
19     // Assume that each tweet/retweet start with a key "metadata" and has a key "retweeted" towards the end
20     // The previous assumption will be used to check for the beginning of a new tweet and the end of a retweet
21     struct Handler {
22         enum state {    // Bitset to store state of search
23             key_date = (1<<0),
24             key_id = (1<<1),
25             key_text = (1<<2),
26             key_reply = (1<<3),
27             key_userid = (1<<4),
28             key_screenname = (1<<5),
29             key_rt = (1<<6),
30             key_fav = (1<<7),
31             found_date = (1<<8),
32             found_id = (1<<9),
33             found_text = (1<<10),
34             found_reply = (1<<11),
35             found_userid = (1<<12),
36             found_screenname = (1<<13),
37             found_rt = (1<<14),
38             found_fav = (1<<15)
39         };
40         int values = state::key_date;
41         bool userobject_id = false; // If in a user object (to find user.id)
42         bool userobject_screen_name = false;    // If in a user object (to find user.screen_name)
43         bool inretweet = false; // If in a retweet (all keys irrelevant in retweet object)
44         // Fields to store partial tweet info
45         uint64_t user_id;
46         uint64_t id;
47         uint64_t rt;
48         uint64_t fav;
49         uint64_t reply_status;
50         std::string_view screen_name;
51         std::string_view date;
52         std::string_view text;
53         std::vector<tweet<std::string_view>>& result;
54 
Handlerrapidjson_sax::Handler55         Handler(std::vector<tweet<std::string_view>> &r) : result(r) { }
56 
Keyrapidjson_sax::Handler57         bool Key(const char* key, SizeType length, bool copy) {
58             if (!inretweet) {   // If not in a retweet object, find relevant keys
59                 if ((length == 16) && (memcmp(key,"retweeted_status",16) == 0)) { inretweet = true; }   // Check if entering retweet
60                 else if ((length == 8) && (memcmp(key,"metadata",8) == 0)) { values = 0; }  // Reset
61                 // Check if key has been found and if key matches a valid key
62                 else if (!(values & found_date) && (length == 10) && (memcmp(key,"created_at",10) == 0)) { values |= (key_date); }
63                 // Must also check if not in a user object
64                 else if (!(values & found_id) && !userobject_id && (length == 2) && (memcmp(key,"id",2) == 0)) { values |= (key_id); }
65                 else if (!(values & found_text) && (length == 4) && (memcmp(key,"text",4) == 0)) { values |= (key_text); }
66                 else if (!(values & found_reply) && (length == 21) && (memcmp(key,"in_reply_to_status_id",21) == 0)) { values |= (key_reply); }
67                 // Check if entering user object
68                 else if ((length == 4) && (memcmp(key,"user",4) == 0)) { userobject_id = userobject_screen_name = true; }
69                 // Must also check if in a user object
70                 else if (!(values & found_userid) && userobject_id && (length == 2) && (memcmp(key,"id",2) == 0)) { values |= (key_userid); }
71                 // Must also check if in a user object
72                 else if (!(values & found_screenname) && userobject_screen_name && (length == 11) && (memcmp(key,"screen_name",11) == 0)) { values |= (key_screenname); }
73                 else if (!(values & found_rt) && (length == 13) && (memcmp(key,"retweet_count",13) == 0)) { values |= (key_rt); }
74                 else if (!(values & found_fav) && (length == 14) && (memcmp(key,"favorite_count",14) == 0)) { values |= (key_fav); }
75             }
76             else if ((length == 9) && (memcmp(key,"retweeted",9) == 0)) { inretweet = false; }  // Check if end of retweet
77             return true;
78         }
Uintrapidjson_sax::Handler79         bool Uint(unsigned i) {
80             if (values & key_userid && !(values & found_userid)) {    // user.id
81                 user_id = i;
82                 userobject_id = false;
83                 values &= ~(key_userid);
84                 values |= (found_userid);
85             }
86             else if (values & key_rt && !(values & found_rt)) {   // retweet_count
87                 rt = i;
88                 values &= ~(key_rt);
89                 values |= (found_rt);
90             }
91             else if (values & key_fav && !(values & found_fav)) {   // favorite_count
92                 fav = i;
93                 values &= ~(key_fav);
94                 values |= (found_fav);
95                 // Assume that this is last key required, so add the partial_tweet to result
96                 result.emplace_back(partial_tweets::tweet<std::string_view>{
97                 date,id,text,reply_status,{user_id,screen_name},rt,fav});
98             }
99             return true;
100         }
Uint64rapidjson_sax::Handler101         bool Uint64(uint64_t i) {
102             if (values & key_id && !(values & found_id)) {    // id
103                 id = i;
104                 values &= ~(key_id);
105                 values |= (found_id);
106             }
107             else if (values & key_reply && !(values & found_reply)) {   // in_reply_status_id
108                 reply_status = i;
109                 values &= ~(key_reply);
110                 values |= (found_reply);
111             }
112             return true;
113         }
Stringrapidjson_sax::Handler114         bool String(const char* str, SizeType length, bool copy) {
115             if (values & key_date && !(values & found_date)) {   //  created_at
116                 date = {str,length};
117                 values &= ~(key_date);
118                 values |= (found_date);
119             }
120             else if (values & key_text && !(values & found_text)) {   // text
121                 text = {str,length};
122                 values &= ~(key_text);
123                 values |= (found_text);
124             }
125             else if (values & key_screenname && !(values & found_screenname)) {    // user.screen_name
126                 screen_name = {str,length};
127                 userobject_screen_name = false;
128                 values &= ~(key_screenname);
129                 values |= (found_screenname);
130             }
131             return true;
132         }
Nullrapidjson_sax::Handler133         bool Null() {
134             if (values & key_reply && !(values & found_reply)) {    // in_reply_status (null case)
135                 reply_status = 0;
136                 values &= ~(key_reply);
137                 values |= (found_reply);
138             }
139             return true;
140         }
141         // Irrelevant events
Boolrapidjson_sax::Handler142         bool Bool(bool b) { return true; }
Doublerapidjson_sax::Handler143         bool Double(double d) { return true; }
Intrapidjson_sax::Handler144         bool Int(int i) { return true; }
Int64rapidjson_sax::Handler145         bool Int64(int64_t i) { return true; }
RawNumberrapidjson_sax::Handler146         bool RawNumber(const char* str, SizeType length, bool copy) { return true; }
StartObjectrapidjson_sax::Handler147         bool StartObject() { return true; }
EndObjectrapidjson_sax::Handler148         bool EndObject(SizeType memberCount) { return true; }
StartArrayrapidjson_sax::Handler149         bool StartArray() { return true; }
EndArrayrapidjson_sax::Handler150         bool EndArray(SizeType elementCount) { return true; }
151     }; // handler
152 
runrapidjson_sax153     bool run(simdjson::padded_string &json, std::vector<tweet<std::string_view>> &result) {
154         Reader reader;
155         Handler handler(result);
156         InsituStringStream ss(json.data());
157         reader.Parse<kParseInsituFlag | kParseValidateEncodingFlag | kParseFullPrecisionFlag>(ss,handler);
158         return true;
159     }
160 
161 }; // rapid_jason_sax
// Register the partial_tweets benchmark instantiated with the rapidjson_sax implementation.
// UseManualTime() is requested, so presumably the benchmark body reports its own iteration time - confirm in the runner.
BENCHMARK_TEMPLATE(partial_tweets, rapidjson_sax)->UseManualTime();
163 } // namespace partial_tweets
164 
165 #endif // SIMDJSON_COMPETITION_RAPIDJSON