Last active
          October 27, 2020 16:28 
        
      - 
      
- 
        Save jkuruzovich/142e10b56e49f95f2b57acd1bd55b0b8 to your computer and use it in GitHub Desktop. 
    This will loop through a set of data files, creating samples of each file. 
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # Loop through a set of data files and, for each one, save its first N rows as smaller sample CSV files. | |
| import os, sys, importlib, glob | |
| import numpy as np | |
| from pathlib import Path | |
| import pandas as pd | |
| sys.path.append(os.path.join(Path.cwd().parent, 'modules')) | |
| import mongoparser as mp | |
| importlib.reload(mp) | |
# Directory holding the source CSV files to be sampled.
data_dir = Path('/Volumes/fusion/data/crunchbase/2018/mongo_queries/production/csv/')

# Project locations are derived from the current working directory, which is
# what makes the script usable when run locally.
cwd_dir = Path.cwd()
base_dir = cwd_dir.parent

# Sample files are written to <base_dir>/data/sample/csv.
sample_dir = base_dir.joinpath('data', 'sample', 'csv')
def save_multiple_files(
    data_path,
    sample_path,
    dir='sample',
    extension='*.csv',
    samples=(1000, 10000),
):
    """Write head-of-file samples for every data file matching a pattern.

    Each file in ``data_path`` that matches ``extension`` is read once, up to
    ``max(samples)`` rows. For every size in ``samples`` the first ``size``
    rows are then written, without the index, to
    ``sample_path / "<name><size>.csv"``, where ``<name>`` is the source file
    name up to its first dot.

    Args:
        data_path: Directory holding the source files (``str`` or ``Path``).
        sample_path: Directory receiving the sample files (``str`` or
            ``Path``). It is created, including parents, if missing.
        dir: Unused; retained so existing callers keep working.
        extension: Glob pattern selecting the source files.
        samples: Row counts to save for each source file. When empty, the
            output directory is still created but nothing is written.
    """
    # Accept plain strings as well as Path objects for both directories.
    data_path = Path(data_path)
    sample_path = Path(sample_path)
    sample_path.mkdir(parents=True, exist_ok=True)

    sizes = list(samples)
    if not sizes:
        return

    # Read only as many rows as the largest requested sample needs.
    max_rows = max(sizes)

    # Sorted so files are processed in a deterministic order.
    for file in sorted(glob.glob(str(data_path / extension))):
        # Path.name handles the platform's separator; keep the original
        # "everything before the first dot" naming rule.
        name = Path(file).name.split('.')[0]
        df = pd.read_csv(file, nrows=max_rows)
        for sample in sizes:
            filename = name + str(sample) + '.csv'
            print("Saving: ", filename)
            df.iloc[:sample].to_csv(sample_path / filename, index=False)
# Generate the sample files for every CSV in data_dir, writing them to sample_dir.
save_multiple_files(data_path=data_dir, sample_path=sample_dir)
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment