# This script takes the files from the MovieLens project
# (http://grouplens.org/datasets/movielens/), reads them in line by line,
# and saves each line to a raw-data table in Cassandra.
from cassandra import cluster

# Parameterized insert shared by every file; one row per source line.
INSERT_RAW_LINE = 'INSERT INTO raw_files (line,filename,contents) VALUES (%s,%s,%s)'

# Upper bound on asynchronous inserts left in flight before we wait for them.
# Keeps the number of outstanding requests (and retained futures) bounded.
MAX_PENDING_INSERTS = 1000


def _drain(pending):
    """Wait for every future in *pending* to finish, then empty the list.

    ``result()`` re-raises any error the request hit, so a failed insert
    stops the script instead of being silently dropped.
    """
    for future in pending:
        future.result()
    del pending[:]


def _load_raw_file(session, path, filename, use_async=False, progress_every=0):
    """Insert every line of the file at *path* into the ``raw_files`` table.

    Args:
        session: Connected Cassandra session used to run the inserts.
        path: Location of the data file on disk.
        filename: Value stored in the ``filename`` column for each row.
        use_async: When true, issue inserts with ``execute_async`` and wait
            for them in batches of ``MAX_PENDING_INSERTS``; otherwise each
            insert is executed synchronously.
        progress_every: When non-zero, print the line index each time it is
            a multiple of this value.
    """
    pending = []
    # Binary mode: each line is passed to the driver exactly as read,
    # including its trailing newline.
    # NOTE(review): under Python 3 these are bytes objects - confirm the
    # type of the ``contents`` column accepts them.
    with open(path, 'rb') as raw_file:
        for idx, line in enumerate(raw_file):
            if progress_every and idx % progress_every == 0:
                print("{}".format(idx))
            params = (idx, filename, line)
            if not use_async:
                session.execute(INSERT_RAW_LINE, params)
                continue
            pending.append(session.execute_async(INSERT_RAW_LINE, params))
            if len(pending) >= MAX_PENDING_INSERTS:
                _drain(pending)
    # Make sure the tail of the asynchronous inserts has completed before
    # moving on (no-op for synchronous loads).
    _drain(pending)


my_cluster = cluster.Cluster(['localhost'])
try:
    session = my_cluster.connect('spark_demo')
    _load_raw_file(session, './ml-1m/movies.dat', 'movies.dat')
    # ratings.dat is loaded asynchronously, with a progress line every
    # 10000 rows.
    _load_raw_file(session, './ml-1m/ratings.dat', 'ratings.dat',
                   use_async=True, progress_every=10000)
    _load_raw_file(session, './ml-1m/users.dat', 'users.dat')
finally:
    # Release the driver's connections and threads even if a load failed.
    my_cluster.shutdown()