#pragma once

#ifdef SIMDJSON_COMPETITION_RAPIDJSON

#include "partial_tweets.h"
#include <string.h>
#include <cstdint>
#include <fstream>
#include <string_view>
#include <vector>

namespace partial_tweets {

using namespace rapidjson;

/// partial_tweets benchmark implemented on top of rapidjson's SAX (event) API.
struct rapidjson_sax {
  using StringType = std::string_view;

  // 8 keys to parse for each tweet (in order of appearance): "created_at", "id", "text",
  // "in_reply_to_status_id", "id"(user), "screen_name"(user), "retweet_count" and "favorite_count".
  // Assume that the first valid key encountered will be the correct key to parse.
  // Assume that each tweet/retweet starts with a key "metadata" and has a key "retweeted" towards the end.
  // The previous assumption is used to detect the beginning of a new tweet and the end of a retweet.
  struct Handler {
    /// Bitset describing the search state.
    /// A key_* bit means "this key was just seen, its value is expected next";
    /// the matching found_* bit means "the value has already been stored for the current tweet".
    enum state {
      key_date         = (1 << 0),
      key_id           = (1 << 1),
      key_text         = (1 << 2),
      key_reply        = (1 << 3),
      key_userid       = (1 << 4),
      key_screenname   = (1 << 5),
      key_rt           = (1 << 6),
      key_fav          = (1 << 7),
      found_date       = (1 << 8),
      found_id         = (1 << 9),
      found_text       = (1 << 10),
      found_reply      = (1 << 11),
      found_userid     = (1 << 12),
      found_screenname = (1 << 13),
      found_rt         = (1 << 14),
      found_fav        = (1 << 15)
    };

    int values = state::key_date;        // Current state bits; cleared to 0 on every "metadata" key
    bool userobject_id = false;          // If in a user object (to find user.id)
    bool userobject_screen_name = false; // If in a user object (to find user.screen_name)
    bool inretweet = false;              // If in a retweet (all keys irrelevant in retweet object)

    // Fields to store partial tweet info.
    // Zero-initialised so that a tweet missing one of the numeric keys is emitted with 0
    // rather than with an indeterminate value.
    uint64_t user_id = 0;
    uint64_t id = 0;
    uint64_t rt = 0;
    uint64_t fav = 0;
    uint64_t reply_status = 0;
    std::string_view screen_name;
    std::string_view date;
    std::string_view text;
    std::vector<tweet<std::string_view>>& result; // Output container, owned by the caller

    /// @param r vector that receives one entry per completed tweet.
    Handler(std::vector<tweet<std::string_view>> &r) : result(r) { }

    /// True when the key (`length` bytes, not necessarily NUL-terminated) equals the literal `name`.
    /// The length to compare is derived from the literal, so it cannot drift out of sync with the text.
    template <size_t N>
    static bool is_key(const char* key, SizeType length, const char (&name)[N]) {
      return (length == N - 1) && (memcmp(key, name, N - 1) == 0);
    }

    /// True when the value for `key_bit` is expected next and has not been stored yet.
    bool awaiting(int key_bit, int found_bit) const {
      return (values & key_bit) && !(values & found_bit);
    }

    /// Records that the value for `key_bit` has been stored.
    void mark_found(int key_bit, int found_bit) {
      values &= ~key_bit;
      values |= found_bit;
    }

    /// Object-key event: updates the state bits according to the key name.
    /// The order of the tests matters ("text"/"user" and the two "id" cases are told apart
    /// by the state flags), so it mirrors the order of appearance listed above.
    /// Always returns true so that parsing continues.
    bool Key(const char* key, SizeType length, bool /*copy*/) {
      if (inretweet) {
        // Inside a retweet every key is ignored except the one marking its end.
        if (is_key(key, length, "retweeted")) { inretweet = false; }
        return true;
      }

      if (is_key(key, length, "retweeted_status")) { inretweet = true; } // Entering retweet
      else if (is_key(key, length, "metadata")) { values = 0; }          // New tweet: reset
      // Check if key has been found and if key matches a valid key
      else if (!(values & found_date) && is_key(key, length, "created_at")) { values |= key_date; }
      // Must also check if not in a user object
      else if (!(values & found_id) && !userobject_id && is_key(key, length, "id")) { values |= key_id; }
      else if (!(values & found_text) && is_key(key, length, "text")) { values |= key_text; }
      else if (!(values & found_reply) && is_key(key, length, "in_reply_to_status_id")) { values |= key_reply; }
      // Check if entering user object
      else if (is_key(key, length, "user")) { userobject_id = userobject_screen_name = true; }
      // Must also check if in a user object
      else if (!(values & found_userid) && userobject_id && is_key(key, length, "id")) { values |= key_userid; }
      // Must also check if in a user object
      else if (!(values & found_screenname) && userobject_screen_name && is_key(key, length, "screen_name")) { values |= key_screenname; }
      else if (!(values & found_rt) && is_key(key, length, "retweet_count")) { values |= key_rt; }
      else if (!(values & found_fav) && is_key(key, length, "favorite_count")) { values |= key_fav; }
      return true;
    }

    /// Stores `i` as tweet id or in_reply_to_status_id if one of them is awaited.
    /// @return true when the value was consumed.
    bool store_status_id(uint64_t i) {
      if (awaiting(key_id, found_id)) { // id
        id = i;
        mark_found(key_id, found_id);
        return true;
      }
      if (awaiting(key_reply, found_reply)) { // in_reply_to_status_id
        reply_status = i;
        mark_found(key_reply, found_reply);
        return true;
      }
      return false;
    }

    /// Stores `i` as user.id, retweet_count or favorite_count if one of them is awaited.
    /// Storing favorite_count also appends the finished tweet to `result`.
    /// @return true when the value was consumed.
    bool store_user_or_count(uint64_t i) {
      if (awaiting(key_userid, found_userid)) { // user.id
        user_id = i;
        userobject_id = false;
        mark_found(key_userid, found_userid);
        return true;
      }
      if (awaiting(key_rt, found_rt)) { // retweet_count
        rt = i;
        mark_found(key_rt, found_rt);
        return true;
      }
      if (awaiting(key_fav, found_fav)) { // favorite_count
        fav = i;
        mark_found(key_fav, found_fav);
        // Assume that this is last key required, so add the partial_tweet to result
        result.emplace_back(partial_tweets::tweet<std::string_view>{
          date, id, text, reply_status, {user_id, screen_name}, rt, fav});
        return true;
      }
      return false;
    }

    // rapidjson chooses between Uint and Uint64 from the magnitude of the number, not from the key,
    // so each callback first tries the fields that normally arrive in that width and then falls
    // back to the other group instead of silently dropping the value.

    /// Unsigned 32-bit number event. Always returns true.
    bool Uint(unsigned i) {
      if (!store_user_or_count(i)) { store_status_id(i); }
      return true;
    }

    /// Unsigned 64-bit number event. Always returns true.
    bool Uint64(uint64_t i) {
      if (!store_status_id(i)) { store_user_or_count(i); }
      return true;
    }

    /// String value event: stores created_at, text or user.screen_name when awaited.
    /// The views refer to the `str` buffer handed in by the reader; nothing is copied here.
    /// Always returns true.
    bool String(const char* str, SizeType length, bool /*copy*/) {
      if (awaiting(key_date, found_date)) { // created_at
        date = {str, length};
        mark_found(key_date, found_date);
      }
      else if (awaiting(key_text, found_text)) { // text
        text = {str, length};
        mark_found(key_text, found_text);
      }
      else if (awaiting(key_screenname, found_screenname)) { // user.screen_name
        screen_name = {str, length};
        userobject_screen_name = false;
        mark_found(key_screenname, found_screenname);
      }
      return true;
    }

    /// Null event: a null in_reply_to_status_id is recorded as 0. Always returns true.
    bool Null() {
      if (awaiting(key_reply, found_reply)) { // in_reply_to_status_id (null case)
        reply_status = 0;
        mark_found(key_reply, found_reply);
      }
      return true;
    }

    // Irrelevant events: accepted and ignored.
    bool Bool(bool /*b*/) { return true; }
    bool Double(double /*d*/) { return true; }
    bool Int(int /*i*/) { return true; }
    bool Int64(int64_t /*i*/) { return true; }
    bool RawNumber(const char* /*str*/, SizeType /*length*/, bool /*copy*/) { return true; }
    bool StartObject() { return true; }
    bool EndObject(SizeType /*memberCount*/) { return true; }
    bool StartArray() { return true; }
    bool EndArray(SizeType /*elementCount*/) { return true; }
  }; // Handler

  /// Parses `json` and appends one partial tweet per status to `result`.
  ///
  /// kParseInsituFlag selects rapidjson's in-situ (destructive) mode, which rewrites the buffer
  /// behind `json.data()`; the string_views placed in `result` are expected to point into that
  /// buffer, so `json` has to outlive `result`.
  ///
  /// @return always true: the ParseResult returned by Reader::Parse is discarded.
  bool run(simdjson::padded_string &json, std::vector<tweet<std::string_view>> &result) {
    Reader reader;
    Handler handler(result);
    InsituStringStream ss(json.data());
    reader.Parse<kParseInsituFlag | kParseValidateEncodingFlag | kParseFullPrecisionFlag>(ss, handler);
    return true;
  }

}; // rapidjson_sax

BENCHMARK_TEMPLATE(partial_tweets, rapidjson_sax)->UseManualTime();

} // namespace partial_tweets

#endif // SIMDJSON_COMPETITION_RAPIDJSON