-
-
Save mumrah/5768976 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Pig script to convert (user, movie, rating, timestamp) data into a user-user graph for running the adsorption algorithm.
The format of the input data is
1::122::5::838985046
*/
/* Register the jars this script relies on. piggybank.jar provides the
   MyRegExLoader storage class used by the LOAD below; RecSys.jar is presumably
   where the pig.TrainTestSplit UDF called later in this script lives (TODO confirm). */
REGISTER '/home/hadoop/RecSys/Pig_Tests/piggybank.jar';
REGISTER /home/hadoop/RecSys/Pig_Tests/RecSys.jar;

/* Load the ratings file. Each line is parsed with a regular expression made of
   four capture groups separated by "::", which become the four typed fields
   below. Other inputs may use a different delimiter, in which case the regular
   expression has to be adapted. */
raw_data =
    LOAD '/home/hadoop/PigTest/learn/data/MovieLeans_Sample.txt'
    USING org.apache.pig.piggybank.storage.MyRegExLoader('([^\\:]+)::([^\\:]+)::([^\\:]+)::([^\\:]+)')
    AS (userID:int, itemID:int, rating:double, unixTime:long);
/* Bucket the input records per user. For every user, emit the bag of that
   user's records ordered by unixTime, followed by the number of records in
   the bag. */
A = GROUP raw_data BY userID;
B = FOREACH A {
    sorted_timestamp = ORDER raw_data BY unixTime;
    GENERATE sorted_timestamp, COUNT(raw_data);
};
/* Pass each user's time-ordered bag ($0) and its record count ($1) to the
   pig.TrainTestSplit UDF and flatten whatever it returns.
   NOTE(review): the UDF is not defined in this script; column $4 of its output
   is presumably a 0/1 train/test marker - confirm against the UDF source. */
C = FOREACH B GENERATE FLATTEN(pig.TrainTestSplit($0, $1));

/* Rows with $4 == 0 go to train_0, rows with $4 == 1 go to test_0. */
SPLIT C INTO
    train_0 IF $4 == 0,
    test_0  IF $4 == 1;

/* Keep only columns $0 and $1 of each split and attach a constant click
   count of 1 to every row. */
train = FOREACH train_0 GENERATE $0 AS userID_tr, $1 AS itemID_tr, 1 AS clickCount_tr;
test  = FOREACH test_0  GENERATE $0 AS userID_te, $1 AS itemID_te, 1 AS clickCount_te;
/*++++++++++++++++++++++++ Everything above is correct. Everything below is still being tested. ++++++++++++++++++++++++++++*/
/* Clean the training set: count the interactions between each user and each
   item. The training rows are grouped on the (userID_tr, itemID_tr) pair and
   the clickCount_tr column is summed within every group. */
group_train = GROUP train BY (userID_tr, itemID_tr);

user_item_clickCount =
    FOREACH group_train
    GENERATE
        FLATTEN(group) AS (userID:int, itemID:int),
        SUM(train.clickCount_tr) AS clickCount;

/* Schema reported by Pig:
describe user_item_clickCount;
user_item_clickCount: {userID: int,itemID: int,clickCount: long}
*/
/******** Creating training labels from user_item_clickCount. ********/
/* Collect all (userID, itemID, clickCount) rows that belong to the same user
   into one bag per user. */
user_group = GROUP user_item_clickCount BY userID;

/* Schema reported by Pig:
describe user_group;
user_group: {group: int,user_item_clickCount: {(userID: int,itemID: int,clickCount:long)}}
*/
/* Turn each user's per-item click counts into a distribution over items.

   Step 1 (user_item_dist): for every user, compute the total number of clicks
   (click_sum) and flatten the user's bag so that the total is repeated on
   every (userID, itemID, clickCount) row of that user.

   Step 2 (user_item_dist_normalized): divide each item's clickCount by the
   user's click_sum to obtain the label for that (user, item) pair.

   Both clickCount and click_sum are longs (SUM over an int/long column yields
   a long), and Pig performs integer division on two longs, which truncated
   every label to 0 (or 1 for users with a single item). Casting the operands
   to double makes the division produce the intended fractional value. */
user_item_dist = FOREACH user_group {
    GENERATE
        group AS userID,
        SUM(user_item_clickCount.clickCount) AS click_sum,
        FLATTEN(user_item_clickCount);
};

user_item_dist_normalized = FOREACH user_item_dist {
    GENERATE
        userID,
        user_item_clickCount::itemID AS itemID,
        /* Cast before dividing so the result is a double, not a truncated long. */
        (double) user_item_clickCount::clickCount / (double) click_sum AS label;
};
/* Per-user total of clickCount: one output row per group in user_group. */
sum =
    FOREACH user_group
    GENERATE SUM(user_item_clickCount.clickCount);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment