-
-
Save emredjan/6fffc4f696d2201d1e3697b783f9590b to your computer and use it in GitHub Desktop.
| ''' | |
| Load Yelp JSON files and spit out CSV files | |
| Does not try to reinvent the wheel and uses pandas json_normalize | |
| Kinda hacky and requires a bit of RAM. But works, albeit naively. | |
| Tested with Yelp JSON files in dataset challenge round 12: | |
| https://www.yelp.com/dataset/challenge | |
| ''' | |
| import json | |
| from pathlib import Path | |
| from time import clock | |
| from typing import Dict, List | |
| import click | |
| import pandas as pd | |
| from pandas.io.json import json_normalize | |
| def read_json_as_array(json_file: Path) -> str: | |
| ''' | |
| Read a given Yelp JSON file as string, adding opening / closing | |
| brackets and commas to convert from separate JSON objects to | |
| an array of JSON objects, so JSON aware libraries can properly read | |
| Parameters | |
| ----------- | |
| json_file: path-like | |
| Returns | |
| ------- | |
| json_data: str | |
| String representation of JSON array | |
| ''' | |
| json_data = '' | |
| with open(json_file, 'r', encoding='utf-8') as in_file: | |
| for i, line in enumerate(in_file): | |
| if i == 0 and line: | |
| json_data += '[' + line | |
| elif line: | |
| json_data += ',' + line | |
| else: | |
| pass | |
| json_data += ']\n' | |
| return json_data | |
| def load_json(json_data: str) -> pd.DataFrame: | |
| ''' | |
| Read and normalize a given JSON array into a pandas DataFrame | |
| Parameters | |
| ----------- | |
| json_data: str | |
| String representation of JSON array | |
| Returns | |
| ------- | |
| df: pandas.DataFrame | |
| DataFrame containing the normalized JSON data | |
| ''' | |
| data = json.loads(json_data) | |
| df = json_normalize(data) | |
| return df | |
| def write_csv(df: pd.DataFrame, out_file: Path) -> None: | |
| ''' | |
| Write a given DataFrame to csv without index | |
| Parameters | |
| ----------- | |
| df: pandas.DataFrame | |
| DataFrame containing the normalized JSON data | |
| out_file: pathlib.Path | |
| A proper path of CSV file name | |
| ''' | |
| df.to_csv(out_file, index=False) | |
| @click.command() | |
| @click.argument('json-dir', type=click.Path(exists=True, dir_okay=True)) | |
| def main(json_dir): | |
| ''' | |
| Read a given directory containing Yelp JSON data and convert those | |
| files to CSV under 'csv_out' in the same directory | |
| ''' | |
| t0 = clock() | |
| json_dir = Path(json_dir) | |
| csv_dir = json_dir / 'csv_out' | |
| csv_dir.mkdir(exist_ok=True) | |
| file_list: List[Path] = json_dir.glob('*.json') | |
| with click.progressbar(file_list, label='Processing files..') as bar: | |
| for file in bar: | |
| csv_file = csv_dir / (file.stem + '.csv') | |
| data = read_json_as_array(file) | |
| df = load_json(data) | |
| write_csv(df, csv_file) | |
| t1 = clock() | |
| mins = (t1 - t0) // 60 | |
| secs = int((t1 - t0) % 60) | |
| timing = f'Conversion finished in {mins} minutes and {secs} seconds' | |
| click.secho(timing, fg='green') | |
| if __name__ == '__main__': | |
| main() # pylint: disable=E1120 |
Hmm, it works fine on my setup with Python 3.6, there shouldn't be any syntactical difference with Python 3.7. Can you give a bit more details? Do you have pandas library installed?
Hi, I am getting this error:
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 2 column 1 (char 3)
How can I solve it?
Hi,
I got this error:
line 86:
file_list: List[Path] = json_dir.glob('*.json')
^
SyntaxError: invalid syntax
May you please help me what it refers to?
another abandoned github
<;( ...conversion of any JSON data source, no matter how deep or complex, to a relational dataset with named columns must be an extremely difficult problem. I've researched it for weeks and can't find a solution other than the usual easy and simple.
I'm getting an error:
File "./json_to_csv.py", line 20
def read_json_as_array(json_file: Path) -> str:
^
SyntaxError: invalid syntax
with Python 3.7. Can you help with this?