Created
July 10, 2019 18:37
-
-
Save itsderek23/90f2197b94393172c6c786dad3a6c4bd to your computer and use it in GitHub Desktop.
SERP Analysis - Load CSV into a Pandas Dataframe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import dateparser | |
import pandas as pd | |
# [keys, row['clicks'], row['impressions'], row['ctr'], row['position']] | |
# cpu steal,3.0,4.0,0.75,1.0,gsc_property,worldwide,mobile, | |
HEADERS = {0:"query", 1: "clicks", 2: "impressions", 3: "ctr", 4: "position", 5: "property", | |
6: "location", 7: "device"} | |
def csv_files(path="/Users/dlite/projects/play/Google-Search-Console-bulk-query"): | |
return find_files(path,r'csv') | |
def find_files(path, regex): | |
regObj = re.compile(regex) | |
res = [] | |
for root, dirs, fnames in os.walk(path): | |
for fname in fnames: | |
if len(regObj.findall(fname)) > 0 : | |
# files are empty if no data for the specified date. Skip. | |
if len(open(os.path.join(root, fname)).readlines( )) > 0: | |
res.append(os.path.join(root, fname)) | |
return res | |
def files_and_dates(files): | |
files_and_dates = [] | |
for file in files: | |
date = {"date": date_from_filename(file)} | |
date["file"] = file | |
files_and_dates.append(date) | |
return files_and_dates | |
def date_from_filename(filename): | |
dates = re.findall(r'\d{8}',filename) | |
# dates look like _20190501 | |
date = dateparser.parse(date_string=dates[0], date_formats=["%Y%m%d"]) | |
return date | |
def load_df(file_info): | |
df = pd.read_csv(file_info['file'],thousands=',',header=None) | |
df['date'] = file_info['date'] | |
return df | |
files = csv_files() | |
with_dates = files_and_dates(csv_files()) | |
df_iterator = map(lambda file_info: load_df(file_info), with_dates) | |
df = pd.concat(df_iterator) | |
df.rename(columns=HEADERS,inplace=True) | |
df.drop(columns=[8],inplace=True) # don't know what this is...it is empty. | |
df.head() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment