# A script to
# - download the GeoLife data (https://www.microsoft.com/en-us/research/publication/geolife-gps-trajectory-dataset-user-guide/)
# - transform it into a single .csv file
# - and cut records to a given boundary
# Not very performant because of the row-wise preprocessing with pandas
# (takes about 30 minutes) and produces a ~1.8 GB output CSV file.
# A sketch of a possibly faster loading approach is appended at the bottom of this file.

import csv
import os
from io import BytesIO
from pathlib import Path
from urllib.request import urlopen
from zipfile import ZipFile

import pandas as pd
from tqdm.auto import tqdm

##### INPUT VARIABLES #####
# set path names
RAW_DATA_PATH = "raw/geolife"
PROCESSED_DATA_PATH = "preprocessed"

# set geo boundaries (a bounding box around Beijing)
CUT_RECORDS_TO_BOUNDARY = True
LNG_MIN = 116.08
LNG_MAX = 116.69
LAT_MIN = 39.66
LAT_MAX = 40.27

############ Download data ###############
# GEOLIFE
if not os.path.exists(RAW_DATA_PATH):
    with tqdm(total=1, desc="Download geolife data") as pbar:  # progress bar
        os.makedirs(RAW_DATA_PATH)
        url = "https://download.microsoft.com/download/F/4/8/F4894AA5-FDBC-481E-9285-D5F8C4C4F039/Geolife%20Trajectories%201.3.zip"
        with urlopen(url) as zipresp:
            with ZipFile(BytesIO(zipresp.read())) as zfile:
                zfile.extractall(RAW_DATA_PATH)
        pbar.update()
else:
    print("Geolife data already exists. Download is skipped.")

############ Preprocess data ###############
#### FUNCTIONS ####
# read one .plt file, drop its header, and prepend user and trajectory ids
def geolife_clean_plt(root, user_id, input_filepath, traj_id):
    with open(os.path.join(root, user_id, "Trajectory", input_filepath), "rt") as fin:
        cr = csv.reader(fin)
        filecontents = list(cr)[6:]  # the first 6 lines are a fixed-format header
    for line in filecontents:
        line.insert(0, traj_id)
        line.insert(0, user_id)
    return filecontents


# collect all users' .plt records into a single DataFrame
def geolife_data_to_df(data_dir):
    data = []
    col_names = ["uid", "tid", "lat", "lng", "-", "Alt", "dayNo", "date", "time"]
    user_id_dirs = [
        name for name in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, name))
    ]
    with tqdm(total=len(user_id_dirs), desc="Preprocess Geolife data") as pbar:  # progress bar
        for user_id in sorted(user_id_dirs):
            tempdirs = os.listdir(os.path.join(data_dir, user_id, "Trajectory"))
            # ignore macOS metadata files
            subdirs = [item for item in tempdirs if not item.endswith(".DS_Store")]
            traj_id = 0
            for subdir in subdirs:
                data += geolife_clean_plt(data_dir, user_id, subdir, traj_id)
                traj_id += 1
            pbar.update()
    return pd.DataFrame(data, columns=col_names)


##### SCRIPT #####
if Path(os.path.join(PROCESSED_DATA_PATH, "geolife.csv")).exists():
    print("Geolife data is already preprocessed. Processing is skipped.")
    df = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, "geolife.csv"))
else:
    if not Path(PROCESSED_DATA_PATH).exists():
        os.makedirs(PROCESSED_DATA_PATH)
    geolife_dir = os.path.join(RAW_DATA_PATH, "Geolife Trajectories 1.3", "Data")
    df = geolife_data_to_df(geolife_dir)
    # merge the separate date and time columns into a single datetime column
    df["datetime"] = pd.to_datetime(df.date + " " + df.time)
    df.drop(columns=["date", "time"], inplace=True)
    # timestamps are recorded in GMT; convert to local time (Asia/Shanghai),
    # then drop the timezone info again
    df["datetime"] = (
        df["datetime"]
        .dt.tz_localize("GMT")
        .dt.tz_convert("Asia/Shanghai")
        .dt.tz_localize(None)
    )
    df.to_csv(os.path.join(PROCESSED_DATA_PATH, "geolife.csv"), index=False)

############ Cut to outline of given boundary ###############
if CUT_RECORDS_TO_BOUNDARY:
    print("Records are cut to the outline of the given boundary.")
    df.lat = df.lat.astype(float)
    df.lng = df.lng.astype(float)
    df = df[
        (df.lat > LAT_MIN)
        & (df.lat < LAT_MAX)
        & (df.lng > LNG_MIN)
        & (df.lng < LNG_MAX)
    ]
    df.to_csv(os.path.join(PROCESSED_DATA_PATH, "geolife_in_boundary.csv"), index=False)
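
############ Appendix: faster loading sketch ###############
# The per-row csv.reader loop above is the main bottleneck. Below is a
# minimal sketch of a possibly faster alternative, assuming the same .plt
# layout (a 6-line header, then lat, lng, -, Alt, dayNo, date, time per row):
# parse each file with pd.read_csv and concatenate all frames once at the end.
# `geolife_plt_to_frame` is a hypothetical helper added here for illustration;
# it is defined but never called by the script above.
def geolife_plt_to_frame(plt_path, user_id, traj_id):
    frame = pd.read_csv(
        plt_path,
        skiprows=6,  # skip the fixed-format .plt header
        header=None,
        names=["lat", "lng", "-", "Alt", "dayNo", "date", "time"],
    )
    frame.insert(0, "tid", traj_id)  # prepend trajectory id column
    frame.insert(0, "uid", user_id)  # prepend user id column
    return frame

# Example usage (as a replacement for the row-list accumulation in
# geolife_data_to_df):
#   frames = [geolife_plt_to_frame(path, uid, tid) for path, uid, tid in ...]
#   df = pd.concat(frames, ignore_index=True)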