/**
 *  This script is an example recommender (using made-up data) showing how you might modify item-item links
 *  by defining similarity relations between items in a dataset and customizing the change in weighting.
 *  This example creates metadata by using the genre field as the metadata_field.  Links between items that
 *  share a genre have their weight cut in half in order to boost the signals of movies that do not have the same genre.
 *  This technique requires a customization of the standard GetItemItemRecommendations macro.
 */
8import 'recommenders.pig';
9
10
11
-- Default input/output locations. A %default value is only used when the parameter is not
-- supplied at launch time (e.g. with -param NAME=value or a parameter file).
-- NOTE: this script also references $II_TABLE, $UI_TABLE, $AWS_ACCESS_KEY_ID and
-- $AWS_SECRET_ACCESS_KEY in its DynamoDB store statements; no default is declared for them here,
-- so they must be provided from outside this file.
%default INPUT_PATH_PURCHASES '../data/retail/purchases.json'
%default INPUT_PATH_WISHLIST '../data/retail/wishlists.json'
%default INPUT_PATH_INVENTORY '../data/retail/inventory.json'
%default OUTPUT_PATH '../data/retail/out/modify_item_item'
16
17
/******** Custom GetItemItemRecommendations *********/
/*
 * Variant of the standard GetItemItemRecommendations macro that adjusts item-item link weights
 * using item metadata before the graph is re-weighted and turned into recommendations.
 *
 * user_item_signals: relation handed unchanged to recsys__BuildItemItemGraph
 *                    (the caller in this script supplies fields user, item, weight).
 * metadata:          relation handed unchanged to recsys__AddMetadataToItemItemLinks
 *                    (the caller in this script supplies fields metadata_field, item).
 * -->
 * item_item_recs:    whatever recsys__BuildItemItemRecommendationsFromGraph returns.
 *
 * The parameters $LOGISTIC_PARAM, $MIN_LINK_WEIGHT, $MAX_LINKS_PER_USER, $BAYESIAN_PRIOR and
 * $NUM_RECS_PER_ITEM are not declared in this file; presumably recommenders.pig provides
 * them -- TODO confirm.
 */
define recsys__GetItemItemRecommendations_ModifyCustom(user_item_signals, metadata) returns item_item_recs {

    -- Convert user_item_signals to an item_item_graph
    ii_links_raw, item_weights   =   recsys__BuildItemItemGraph(
                                       $user_item_signals,
                                       $LOGISTIC_PARAM,
                                       $MIN_LINK_WEIGHT,
                                       $MAX_LINKS_PER_USER
                                     );

    -- NOTE this function is added in order to combine metadata with item-item links.
    -- See the macro itself for a more detailed explanation.
    -- The code below reads item_A, item_B, metadata_A, metadata_B and weight from its output.
    -- NOTE(review): the caller flattens a bag of genres, so one item can carry several metadata
    -- rows; depending on how the join inside this macro is written, a link may then appear once
    -- per metadata pairing -- confirm against recsys__AddMetadataToItemItemLinks.
    ii_links_metadata           =   recsys__AddMetadataToItemItemLinks(
                                        ii_links_raw,
                                        $metadata
                                    );

    /********* Custom Code starts here ********/

    -- Adjust the weight of each item-item link based on whether both items carry the same metadata.
    -- If the metadata is the same, the weight is halved.  Otherwise the weight is left alone.
    --
    -- The amount of adjustment depends on the domain of the data and on what is expected.
    -- It is always best to adjust the weight by multiplying it by a factor rather than by adding a constant.
    --
    -- The explicit null checks matter: in Pig, comparing a null with == yields null, and a bincond
    -- whose condition is null yields null.  Without the guards, a link whose item has no metadata
    -- would end up with a null weight instead of keeping its original weight.
    ii_links_adjusted           =   foreach ii_links_metadata generate item_A, item_B,
                                        ((metadata_A is not null and metadata_B is not null and metadata_B == metadata_A)
                                            ? (weight * 0.5) : weight) as weight;

    /******** Custom Code stops here *********/

    -- Clamp non-positive weights to zero, just in case a custom adjustment produced one.
    -- (Links are kept, not dropped; only their weight is floored at 0.)
    ii_links_adjusted_filt      =   foreach ii_links_adjusted generate item_A, item_B,
                                        (weight <= 0.0 ? 0.0 : weight) as weight;

    -- Adjust the weights of the graph to improve recommendations.
    ii_links                    =   recsys__AdjustItemItemGraphWeight(
                                        ii_links_adjusted_filt,
                                        item_weights,
                                        $BAYESIAN_PRIOR
                                    );

    -- Use the item-item graph to create item-item recommendations.
    -- $NUM_RECS_PER_ITEM is intentionally passed for both trailing arguments.
    $item_item_recs =  recsys__BuildItemItemRecommendationsFromGraph(
                           ii_links,
                           $NUM_RECS_PER_ITEM,
                           $NUM_RECS_PER_ITEM
                       );
};
64
65
/******* Load Data **********/

-- Purchases: one JSON record per purchase, read with the schema given below.
purchases_raw = LOAD '$INPUT_PATH_PURCHASES'
                USING org.apache.pig.piggybank.storage.JsonLoader(
                    'row_id: int,
                     movie_id: chararray,
                     movie_name: chararray,
                     user_id: chararray,
                     purchase_price: int');

-- Wishlists: one JSON record per wishlisted movie, read with the schema given below.
wishlists_raw = LOAD '$INPUT_PATH_WISHLIST'
                USING org.apache.pig.piggybank.storage.JsonLoader(
                     'row_id: int,
                      movie_id: chararray,
                      movie_name: chararray,
                      user_id: chararray');


/******* Convert Data to Signals **********/

-- A purchase is given the weight 1.0, the larger of the two signal weights used here.
signals_from_purchases = FOREACH purchases_raw GENERATE
                             user_id    AS user,
                             movie_name AS item,
                             1.0        AS weight;

-- A wishlist entry is a weaker signal than a purchase, so it starts at half that weight.
signals_from_wishlists = FOREACH wishlists_raw GENERATE
                             user_id    AS user,
                             movie_name AS item,
                             0.5        AS weight;

-- All (user, item, weight) signals from both sources in a single relation.
user_signals = UNION signals_from_purchases, signals_from_wishlists;
101
102
/******** Changes for Modifying item-item links ******/

-- Inventory: each movie title together with a bag of its genres.
inventory_raw = LOAD '$INPUT_PATH_INVENTORY'
                USING org.apache.pig.piggybank.storage.JsonLoader(
                     'movie_title: chararray,
                      genres: bag{tuple(content:chararray)}');

-- One (metadata_field, item) row per genre of each movie; the genre is the metadata.
metadata = FOREACH inventory_raw GENERATE
               FLATTEN(genres) AS metadata_field,
               movie_title     AS item;

-- The customized macro must be written separately from the standard one.
-- NOTE: it is defined within this file for clarity.
item_item_recs = recsys__GetItemItemRecommendations_ModifyCustom(user_signals, metadata);

/******* No more changes ********/


user_item_recs = recsys__GetUserItemRecommendations(user_signals, item_item_recs);
119
-- Completely unrelated code stuck in the middle of the recommender.
-- NOTE(review): this reads from and writes to what look like placeholder S3 paths, and none of
-- the aliases defined here (data, responses, out) is read by any later statement visible in
-- this script -- confirm whether it should live in this file at all.
data        =    load 's3n://my-s3-bucket/path/to/responses'
                 using org.apache.pig.piggybank.storage.JsonLoader();

-- Pull the 'response' map out of each loaded record.
responses   =    foreach data generate object#'response' as response: map[];

-- Project the id, thread and comments entries of the response map into typed fields.
out         =    foreach responses generate
                     response#'id'       as id: int,
                     response#'thread'   as thread: chararray,
                     response#'comments' as comments: {t: (comment: chararray)};

store out into 's3n://path/to/output' using PigStorage('|');
128
129
/******* Store recommendations **********/

-- Hadoop refuses to write into an output folder that already exists,
-- so clear both target folders first.
rmf $OUTPUT_PATH/item_item_recs;
rmf $OUTPUT_PATH/user_item_recs;

-- Write both recommendation sets out with PigStorage.
STORE item_item_recs INTO '$OUTPUT_PATH/item_item_recs' USING PigStorage();
STORE user_item_recs INTO '$OUTPUT_PATH/user_item_recs' USING PigStorage();

-- Write the item-item recommendations to the DynamoDB table named by $II_TABLE.
STORE item_item_recs
    INTO '$OUTPUT_PATH/unused-ii-table-data'
    USING com.mortardata.pig.storage.DynamoDBStorage('$II_TABLE', '$AWS_ACCESS_KEY_ID', '$AWS_SECRET_ACCESS_KEY');

-- Write the user-item recommendations to the DynamoDB table named by $UI_TABLE.
STORE user_item_recs
    INTO '$OUTPUT_PATH/unused-ui-table-data'
    USING com.mortardata.pig.storage.DynamoDBStorage('$UI_TABLE', '$AWS_ACCESS_KEY_ID', '$AWS_SECRET_ACCESS_KEY');
149