mumrah · December 18, 2015 10:29
diff --git a/script.pig b/script.pig
 /* Pig script that converts (user, movie, rating, timestamp) data into a user-user graph for running the adsorption algorithm.

 The input has one record per line, with the fields userID, itemID, rating and unixTime separated by "::", for example:

 1::122::5::838985046

 */

 /* Register the jars this script needs: piggybank.jar (the loader below lives in
    the org.apache.pig.piggybank package) and RecSys.jar (presumably the source of
    the pig.TrainTestSplit UDF used further down -- confirm). */
 REGISTER '/home/hadoop/RecSys/Pig_Tests/piggybank.jar';
 REGISTER /home/hadoop/RecSys/Pig_Tests/RecSys.jar;

 /* Load the input file. The regular expression splits every line into four
    "::"-separated capture groups, which become the four typed fields below.
    The pattern has to be adjusted if the input uses a different delimiter. */
 raw_data =
     LOAD '/home/hadoop/PigTest/learn/data/MovieLeans_Sample.txt'
     USING org.apache.pig.piggybank.storage.MyRegExLoader('([^\\:]+)::([^\\:]+)::([^\\:]+)::([^\\:]+)')
     AS (
         userID:   int,
         itemID:   int,
         rating:   double,
         unixTime: long
     );

 /* For every user, collect that user's records, order them by timestamp, and
    emit the ordered bag together with the COUNT of the user's records. */
 A = GROUP raw_data BY userID;
 B = FOREACH A {
     sorted_timestamp = ORDER raw_data BY unixTime;
     GENERATE
         sorted_timestamp,
         COUNT(raw_data);
 };


 /* Pass each user's time-ordered bag ($0) and its record count ($1) to the
    pig.TrainTestSplit UDF and flatten whatever it returns.
    NOTE(review): the UDF is not defined in this script; the statements below rely
    on its output carrying the user id at $0, the item id at $1 and a 0/1 flag at
    $4 -- confirm against the UDF. */
 C = FOREACH B GENERATE FLATTEN(pig.TrainTestSplit($0, $1));

 /* Route rows by the flag in $4: 0 goes to train_0, 1 goes to test_0. */
 SPLIT C INTO
     train_0 IF $4 == 0,
     test_0  IF $4 == 1;

 /* Keep only the first two fields of each row and attach a constant count of 1. */
 train = FOREACH train_0 GENERATE $0 AS userID_tr, $1 AS itemID_tr, 1 AS clickCount_tr;
 test  = FOREACH test_0  GENERATE $0 AS userID_te, $1 AS itemID_te, 1 AS clickCount_te;

 /*++++++++++++++++++++++++ Above is correct. Below is a test. ++++++++++++++++++++++++++++*/

 /* Clean the training set: count the interactions of every (user, item) pair by
    grouping on both ids and summing the per-row click count (the third column). */
 group_train = GROUP train BY (userID_tr, itemID_tr);

 user_item_clickCount = FOREACH group_train GENERATE
     FLATTEN(group) AS (userID:int, itemID:int),
     SUM(train.clickCount_tr) AS clickCount;

 /* Schema reported by DESCRIBE:
    describe user_item_clickCount;
    user_item_clickCount: {userID: int,itemID: int,clickCount: long}
 */


 /* ======== Creating training labels from user_item_clickCount ======== */

 /* Gather all (userID, itemID, clickCount) rows that belong to the same user. */
 user_group = GROUP user_item_clickCount BY userID;

 /* Schema reported by DESCRIBE:
    describe user_group;
    user_group: {group: int,user_item_clickCount: {(userID: int,itemID: int,clickCount:long)}}
 */

 /* Turn each user's per-item click counts into a distribution over items.

    Step 1: for every user, emit the user's total click count (click_sum) next to
    each of that user's (userID, itemID, clickCount) rows. FLATTEN prefixes the
    flattened fields with "user_item_clickCount::". */
 user_item_dist = FOREACH user_group GENERATE
     group AS userID,
     SUM(user_item_clickCount.clickCount) AS click_sum,
     FLATTEN(user_item_clickCount);

 /* Step 2: label = clickCount / click_sum.
    Both operands are long, and dividing two longs in Pig is an integer division
    that truncates every fraction below 1 to 0. Cast to double first so that the
    labels are real fractions. */
 user_item_dist_normalized = FOREACH user_item_dist GENERATE
     userID,
     user_item_clickCount::itemID AS itemID,
     (double) user_item_clickCount::clickCount / (double) click_sum AS label;

 /* The per-user total click count on its own (one row per user). */
 sum = FOREACH user_group GENERATE SUM(user_item_clickCount.clickCount);
	/* Pig script that converts (user, movie, rating, timestamp) data into a user-user graph for running the adsorption algorithm.

	The input has one record per line, with the fields userID, itemID, rating and unixTime separated by "::", for example:

	1::122::5::838985046

	*/

	/* Loading the data into a table. The delimiter might be different for different inputs.

	   Register the jars this script needs: piggybank.jar (the loader below lives in
	   the org.apache.pig.piggybank package) and RecSys.jar (presumably the source of
	   the pig.TrainTestSplit UDF used further down -- confirm). */
	REGISTER '/home/hadoop/RecSys/Pig_Tests/piggybank.jar';
	REGISTER /home/hadoop/RecSys/Pig_Tests/RecSys.jar;

	/* The regular expression splits every line into four "::"-separated capture
	   groups, which become the four typed fields below. */
	raw_data =
	    LOAD '/home/hadoop/PigTest/learn/data/MovieLeans_Sample.txt'
	    USING org.apache.pig.piggybank.storage.MyRegExLoader('([^\\:]+)::([^\\:]+)::([^\\:]+)::([^\\:]+)')
	    AS (userID:int, itemID:int, rating:double, unixTime:long);

	/* Grouping the input data based on user and sorting the records based on timestamp.
	   Each output row holds the user's time-ordered bag and the COUNT of the user's records. */
	A = GROUP raw_data BY userID;
	B = FOREACH A {
	    sorted_timestamp = ORDER raw_data BY unixTime;
	    GENERATE sorted_timestamp, COUNT(raw_data);
	};


	/* Pass each user's time-ordered bag ($0) and its record count ($1) to the
	   pig.TrainTestSplit UDF and flatten whatever it returns.
	   NOTE(review): the UDF is not defined in this script; the statements below rely
	   on its output carrying the user id at $0, the item id at $1 and a 0/1 flag at
	   $4 -- confirm against the UDF. */
	C = FOREACH B GENERATE FLATTEN(pig.TrainTestSplit($0, $1));

	/* Route rows by the flag in $4: 0 goes to train_0, 1 goes to test_0. */
	SPLIT C INTO
	    train_0 IF $4 == 0,
	    test_0  IF $4 == 1;

	/* Keep only the first two fields of each row and attach a constant count of 1. */
	train = FOREACH train_0 GENERATE $0 AS userID_tr, $1 AS itemID_tr, 1 AS clickCount_tr;
	test  = FOREACH test_0  GENERATE $0 AS userID_te, $1 AS itemID_te, 1 AS clickCount_te;

	/*++++++++++++++++++++++++ Above is correct. Below is a test. ++++++++++++++++++++++++++++*/

	/* Clean the training set: count the interactions of every (user, item) pair by
	   grouping on both ids and summing the per-row click count (the third column). */
	group_train = GROUP train BY (userID_tr, itemID_tr);

	user_item_clickCount = FOREACH group_train GENERATE
	    FLATTEN(group) AS (userID:int, itemID:int),
	    SUM(train.clickCount_tr) AS clickCount;

	/* Schema reported by DESCRIBE:
	   describe user_item_clickCount;
	   user_item_clickCount: {userID: int,itemID: int,clickCount: long}
	*/


	/* ======== Creating training labels using user_item_clickCount ======== */

	/* Grouping the file based on user: gathers all (userID, itemID, clickCount)
	   rows that belong to the same user. */
	user_group = GROUP user_item_clickCount BY userID;

	/* Schema reported by DESCRIBE:
	   describe user_group;
	   user_group: {group: int,user_item_clickCount: {(userID: int,itemID: int,clickCount:long)}}
	*/

	/* Turn each user's per-item click counts into a distribution over items.

	   Step 1: for every user, emit the user's total click count (click_sum) next to
	   each of that user's (userID, itemID, clickCount) rows. FLATTEN prefixes the
	   flattened fields with "user_item_clickCount::". */
	user_item_dist = FOREACH user_group GENERATE
	    group AS userID,
	    SUM(user_item_clickCount.clickCount) AS click_sum,
	    FLATTEN(user_item_clickCount);

	/* Step 2: label = clickCount / click_sum.
	   Both operands are long, and dividing two longs in Pig is an integer division
	   that truncates every fraction below 1 to 0. Cast to double first so that the
	   labels are real fractions. */
	user_item_dist_normalized = FOREACH user_item_dist GENERATE
	    userID,
	    user_item_clickCount::itemID AS itemID,
	    (double) user_item_clickCount::clickCount / (double) click_sum AS label;

	/* The per-user total click count on its own (one row per user). */
	sum = FOREACH user_group GENERATE SUM(user_item_clickCount.clickCount);