Last active
June 11, 2019 19:15
-
-
Save victorkohler/e7e6a11ada6d2e6616f841f4a7a53536 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_negatives(uids, iids, items, df_test, n_negatives=100):
    """Sample negative (non-interacted) items for every row of the test set.

    For each (user, item) pair in ``df_test`` this draws ``n_negatives``
    random integers from ``range(len(items))`` and keeps only those that
    the user has no recorded interaction with in ``(uids, iids)``. Draws
    are independent, so the same negative may appear more than once in a row.

    NOTE(review): the sampled values are positions in ``range(len(items))``
    and are compared directly against ``iids``; this presumably relies on
    item ids being the integers 0..len(items)-1 -- confirm against caller.

    Args:
        uids (np.array): User id of every observed interaction.
        iids (np.array): Item id of every observed interaction, aligned
            with ``uids``.
        items (list): List of all unique items; only its length is used.
        df_test (dataframe): Test set with 'user_id' and 'item_id' columns.
        n_negatives (int): Number of negatives to draw per test row.
            Defaults to 100.

    Returns:
        df_neg (dataframe): One row per test pair. Column 0 holds the
            ``(user_id, item_id)`` tuple, the remaining ``n_negatives``
            columns hold the sampled negative item ids.

    Raises:
        ValueError: If negatives are requested for a user who has already
            interacted with every candidate item (sampling could otherwise
            never finish).
    """
    n_items = len(items)

    # Index the observed interactions per user so that each rejection test
    # is a plain set lookup.
    seen_by_user = {}
    for user, item in zip(uids, iids):
        seen_by_user.setdefault(user, set()).add(item)

    candidate_ids = set(range(n_items))
    no_history = frozenset()

    test_users = df_test['user_id'].values.tolist()
    test_items = df_test['item_id'].values.tolist()

    rows = []
    for u, i in zip(test_users, test_items):
        seen = seen_by_user.get(u, no_history)

        # Without this guard the rejection loop below would spin forever
        # for a user that has touched every candidate id.
        if n_negatives > 0 and candidate_ids <= seen:
            raise ValueError(
                'Cannot sample negatives for user {!r}: no item without '
                'an interaction is available.'.format(u)
            )

        row = [(u, i)]  # The positive pair always leads the row.
        for _ in range(n_negatives):
            j = np.random.randint(n_items)
            while j in seen:  # Re-draw until the user has no interaction.
                j = np.random.randint(n_items)
            row.append(j)
        rows.append(row)

    df_neg = pd.DataFrame(rows)
    return df_neg
def mask_first(x):
    """Build a mask that is 0 at position 0 and 1 everywhere else.

    The mask is created with ``np.ones_like(x)``, so it takes its shape
    and dtype from ``x``.
    """
    keep = np.ones_like(x)
    keep[0] = 0  # The leading entry is the one being masked out.
    return keep
def train_test_split(df):
    """Split the interactions into a leave-one-out train and test set.

    The test set holds one item per user (the first one found for that
    user in ``df``); it is the holdout item used to compute Top@K later.
    The training set is the original data without the first row of each
    user. ``df`` itself is not modified.

    Args:
        df (dataframe): Original data with at least the columns
            'user_id', 'item_id' and 'plays'.

    Returns:
        df_train (dataframe): Every row of ``df`` except the first row
            of each user.
        df_test (dataframe): One row per user with the columns
            'user_id', 'item_id' and 'plays', indexed by user id
            (the index carries no name).
    """
    per_user = df.groupby(['user_id'])

    # Holdout: the first entry of every user.
    # NOTE: GroupBy.first() takes the first non-null value per column, so
    # it only matches the first row when that row has no missing values.
    df_test = per_user.first()
    df_test['user_id'] = df_test.index
    df_test = df_test[['user_id', 'item_id', 'plays']]
    # Assign instead of `del`: deleting Index.name raises AttributeError
    # on pandas >= 1.0.
    df_test.index.name = None

    # Drop the first row of each user from the training data. cumcount()
    # numbers the rows inside each group from 0, so everything that is not
    # row 0 stays in the training set.
    not_holdout = per_user.cumcount() != 0
    df_train = df.loc[not_holdout]

    return df_train, df_test
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment