/**
 * Example recommender (run on made-up data) showing how item-item links can be
 * modified by declaring items similar through a shared metadata value and then
 * customizing how the link weight changes.
 *
 * The genre of each movie is used as the metadata_field. A link between two items
 * with the same genre has its weight cut in half, which boosts the signal of movies
 * that do not share a genre.
 *
 * This technique requires a customized copy of the standard
 * GetItemItemRecommendations macro, which is defined in this file.
 */
IMPORT 'recommenders.pig';


%default INPUT_PATH_PURCHASES '../data/retail/purchases.json'
%default INPUT_PATH_WISHLIST '../data/retail/wishlists.json'
%default INPUT_PATH_INVENTORY '../data/retail/inventory.json'
%default OUTPUT_PATH '../data/retail/out/modify_item_item'


/******** Custom GetItemItemRecommendations *********/
/*
 * Builds item-item recommendations after re-weighting links by metadata equality.
 *
 * Parameters:
 *   user_item_signals - signal relation (this script passes user, item, weight rows)
 *   metadata          - metadata relation (this script passes metadata_field, item rows)
 * Returns:
 *   item_item_recs    - result of recsys__BuildItemItemRecommendationsFromGraph
 *
 * NOTE(review): LOGISTIC_PARAM, MIN_LINK_WEIGHT, MAX_LINKS_PER_USER, BAYESIAN_PRIOR and
 * NUM_RECS_PER_ITEM are not defined in this file - presumably they come from
 * recommenders.pig or the command line. Verify.
 */
DEFINE recsys__GetItemItemRecommendations_ModifyCustom(user_item_signals, metadata) RETURNS item_item_recs {

    -- Turn the user-item signals into an item-item graph.
    raw_links, per_item_weights = recsys__BuildItemItemGraph(
        $user_item_signals,
        $LOGISTIC_PARAM,
        $MIN_LINK_WEIGHT,
        $MAX_LINKS_PER_USER
    );

    -- NOTE: this step is an addition to the standard macro. It combines the metadata
    -- with the item-item links. See that macro for a more detailed explanation.
    links_with_metadata = recsys__AddMetadataToItemItemLinks(
        raw_links,
        $metadata
    );

    /********* Custom code starts here ********/

    -- Adjust each weight based on the item-item link and on whether the metadata of
    -- both items is equal. Equal metadata halves the weight. Otherwise the weight is
    -- left alone.
    -- How much to adjust depends on the domain of the data and on what is expected.
    -- Scaling the weight by a factor is preferable to adding a constant.
    reweighted_links = FOREACH links_with_metadata GENERATE
        item_A,
        item_B,
        (metadata_A == metadata_B ? (weight * 0.5) : weight) AS weight;

    /******** Custom code stops here *********/

    -- Replace any weight that is zero or negative with 0, just in case.
    clamped_links = FOREACH reweighted_links GENERATE
        item_A,
        item_B,
        (weight <= 0 ? 0 : weight) AS weight;

    -- Adjust the weights of the graph to improve recommendations.
    adjusted_links = recsys__AdjustItemItemGraphWeight(
        clamped_links,
        per_item_weights,
        $BAYESIAN_PRIOR
    );

    -- Use the item-item graph to create the item-item recommendations.
    $item_item_recs = recsys__BuildItemItemRecommendationsFromGraph(
        adjusted_links,
        $NUM_RECS_PER_ITEM,
        $NUM_RECS_PER_ITEM
    );
};


/******* Load Data **********/

-- Purchase records, one JSON object per purchase.
purchase_input = LOAD '$INPUT_PATH_PURCHASES'
    USING org.apache.pig.piggybank.storage.JsonLoader(
        'row_id: int,
         movie_id: chararray,
         movie_name: chararray,
         user_id: chararray,
         purchase_price: int');

-- Wishlist records, one JSON object per wishlist entry.
wishlist_input = LOAD '$INPUT_PATH_WISHLIST'
    USING org.apache.pig.piggybank.storage.JsonLoader(
        'row_id: int,
         movie_id: chararray,
         movie_name: chararray,
         user_id: chararray');


/******* Convert Data to Signals **********/

-- A purchase gets a weight of 1, chosen as the maximum weight for a signal.
purchase_signals = FOREACH purchase_input GENERATE
    user_id    AS user,
    movie_name AS item,
    1.0        AS weight;


-- Start with 0.5 as the weight for wishlist items because a wishlist entry is a
-- weaker signal than purchasing an item.
-- Each wishlist entry becomes a user/item signal with a weight of 0.5.
wishlist_signals = FOREACH wishlist_input GENERATE
    user_id    AS user,
    movie_name AS item,
    0.5        AS weight;

-- Purchases and wishlist entries together form the full set of signals.
user_signals = UNION purchase_signals, wishlist_signals;


/******** Changes for Modifying item-item links ******/
inventory = LOAD '$INPUT_PATH_INVENTORY'
    USING org.apache.pig.piggybank.storage.JsonLoader(
        'movie_title: chararray,
         genres: bag{tuple(content:chararray)}');

-- FLATTEN emits one row per element of the genres bag, paired with the movie title.
-- NOTE(review): a movie with several genres therefore yields several metadata rows.
-- How recsys__AddMetadataToItemItemLinks treats that is not visible here - verify.
item_metadata = FOREACH inventory GENERATE
    FLATTEN(genres) AS metadata_field,
    movie_title     AS item;

-- The customized macro has to be written separately.
-- NOTE: it is defined within this file for clarity.
item_item_recs = recsys__GetItemItemRecommendations_ModifyCustom(user_signals, item_metadata);
/******* No more changes ********/


user_item_recs = recsys__GetUserItemRecommendations(user_signals, item_item_recs);

-- Completely unrelated to the recommender above: this pipeline reads JSON responses,
-- pulls the response map out of each object, projects id, thread and comments, and
-- stores the result delimited by a pipe character. It uses none of the aliases above.
raw_responses = LOAD 's3n://my-s3-bucket/path/to/responses'
    USING org.apache.pig.piggybank.storage.JsonLoader();
response_maps = FOREACH raw_responses GENERATE object#'response' AS response: map[];
response_fields = FOREACH response_maps GENERATE
    response#'id'       AS id: int,
    response#'thread'   AS thread: chararray,
    response#'comments' AS comments: {t: (comment: chararray)};
STORE response_fields INTO 's3n://path/to/output' USING PigStorage('|');


/******* Store recommendations **********/

-- If the output folder already exists, hadoop will refuse to write data to it.

-- Clear the output directories of any previous run before writing to them.
rmf $OUTPUT_PATH/item_item_recs;
rmf $OUTPUT_PATH/user_item_recs;

-- Write both recommendation sets to the output path using the default PigStorage.
STORE item_item_recs INTO '$OUTPUT_PATH/item_item_recs' USING PigStorage();
STORE user_item_recs INTO '$OUTPUT_PATH/user_item_recs' USING PigStorage();

-- Store the item_item_recs into dynamo.
-- NOTE(review): II_TABLE, UI_TABLE and the AWS credentials have no default in this
-- file and must be supplied when the script is run. The unused-* paths are presumably
-- ignored by DynamoDBStorage - confirm.
STORE item_item_recs
    INTO '$OUTPUT_PATH/unused-ii-table-data'
    USING com.mortardata.pig.storage.DynamoDBStorage('$II_TABLE', '$AWS_ACCESS_KEY_ID', '$AWS_SECRET_ACCESS_KEY');

-- Store the user_item_recs into dynamo.
STORE user_item_recs
    INTO '$OUTPUT_PATH/unused-ui-table-data'
    USING com.mortardata.pig.storage.DynamoDBStorage('$UI_TABLE', '$AWS_ACCESS_KEY_ID', '$AWS_SECRET_ACCESS_KEY');