@n-lavrenko
Last active May 22, 2020 14:45
Python script that reads .csv files containing historical bar and tick data (in separate files), auto-detects winter weeks based on the data in the file, changes the timezone, removes historical data on Saturdays, subtracts one hour during winter weeks, and writes the updated data to a new file.
# I use this script to make Dukascopy historical data identical to that of the FxClub forex broker.
# Then I can run deep historical optimization in MetaTrader 5 and use the results in real trading on FxClub.
# You can edit some variables and the code itself to reshape the Dukascopy historical data as you need.
# Good luck.
import time
import pandas as pd
symbol = 'AUDUSD'
path_to_bars_file = f'D:\\TickstoryDownloads\\{symbol}\\{symbol}_mt5_bars.csv'
path_to_bars_edited_file = f'D:\\TickstoryDownloads\\{symbol}\\{symbol}_bars_ducas_clean_UTC+3.csv'
path_to_tick_file = f'D:\\TickstoryDownloads\\{symbol}\\{symbol}_mt5_ticks.csv'
path_to_ticks_edited_file = f'D:\\TickstoryDownloads\\{symbol}\\{symbol}_ticks_ducas_clean_UTC+3.csv'
bars_cols_names = ['date', 'time', 'open', 'high', 'low', 'close', 'tick_vol', 'vol', 'spread']
bars_dtype = {'date': object, 'open': str, 'high': str, 'low': str, 'close': str, 'tick_vol': str, 'vol': str}
ticks_cols_names = ['date', 'time', 'bid', 'ask']
ticks_dtype = {'date': object, 'bid': str, 'ask': str}
years_count = 14  # years covered by the data, starting from 2007 (see date_exclude_creator)
bars_chunk_size = 100000
ticks_chunk_size = 1000000
increase_hours_timezone = 3  # hours added to shift the data to UTC+3
is_print_technical_info = False

def get_reader(path, cols, d_type, chunk_size):
    # Chunked CSV reader, so huge files never have to fit in memory at once.
    return pd.read_csv(path, dtype=d_type, iterator=True, sep=',', header=None, names=cols,
                       usecols=range(0, len(cols)), chunksize=chunk_size)

def get_winter_weeks():
    # Collect 'year/week' labels of weeks that still contain weekend bars after
    # the timezone shift: those are the winter (non-DST) weeks to adjust.
    all_weeks_should_changed = []
    reader = get_reader(path_to_bars_file, bars_cols_names, bars_dtype, bars_chunk_size)
    for chunk in reader:
        chunk['date_time'] = pd.to_datetime(chunk['date'] + chunk['time'], format='%Y%m%d%H:%M:%S')
        chunk['date_time'] = chunk['date_time'] + pd.Timedelta(hours=increase_hours_timezone)
        chunk['dayofweek'] = chunk['date_time'].dt.dayofweek
        chunk['is_day_off'] = chunk['dayofweek'] > 4
        chunk['week'] = chunk['date_time'].dt.week
        chunk['year_week'] = chunk['date_time'].dt.year.astype('str') + '/' + chunk['week'].astype('str')
        mask = chunk['is_day_off']
        unique = chunk.loc[mask, 'year_week'].unique()
        all_weeks_should_changed.extend(unique)
    # Deduplicate while preserving order.
    result = list(dict.fromkeys(all_weeks_should_changed))
    if is_print_technical_info:
        print(result)
    return result

def date_exclude_creator():
    # Build start/end boundaries of the New Year and Christmas periods for every
    # year in the data, so those thin-liquidity days can be dropped later.
    start_year = 2007
    new_year_date_start = '01-01 00:00:00'
    new_year_date_end = '01-01 23:59:59'
    christmas_date_start = '12-24 22:00:00'
    christmas_date_end = '12-26 02:59:59'
    _all_new_years_start = []
    _all_new_years_end = []
    _all_christmases_start = []
    _all_christmases_end = []
    for d in range(years_count):
        _all_new_years_start.append(str(start_year + d) + '-' + new_year_date_start)
        _all_new_years_end.append(str(start_year + d) + '-' + new_year_date_end)
        _all_christmases_start.append(str(start_year + d) + '-' + christmas_date_start)
        _all_christmases_end.append(str(start_year + d) + '-' + christmas_date_end)
    if is_print_technical_info:
        print(_all_new_years_start, _all_new_years_end, _all_christmases_start, _all_christmases_end)
    return _all_new_years_start, _all_new_years_end, _all_christmases_start, _all_christmases_end

def filter_dates(chunk):
    # Drop rows that fall inside any of the excluded New Year / Christmas windows.
    for index in range(years_count):
        y_s = pd.to_datetime(all_new_years_start[index])
        y_e = pd.to_datetime(all_new_years_end[index])
        x_s = pd.to_datetime(all_christmases_start[index])
        x_e = pd.to_datetime(all_christmases_end[index])
        mask_year = (chunk['date_time'] >= y_s) & (chunk['date_time'] <= y_e)
        mask_xmas = (chunk['date_time'] >= x_s) & (chunk['date_time'] <= x_e)
        chunk = chunk[~(mask_year | mask_xmas)]
    return chunk

def read_edit_write(is_bars):
    if is_bars:
        reader = get_reader(path_to_bars_file, bars_cols_names, bars_dtype, bars_chunk_size)
        cols_names = bars_cols_names
        path_to_write = path_to_bars_edited_file
    else:
        reader = get_reader(path_to_tick_file, ticks_cols_names, ticks_dtype, ticks_chunk_size)
        cols_names = ticks_cols_names
        path_to_write = path_to_ticks_edited_file
    with open(path_to_write, 'w') as out_file:
        for chunk in reader:
            chunk['date_time'] = pd.to_datetime(chunk['date'] + chunk['time'], format='%Y%m%d%H:%M:%S')
            chunk['date_time'] = chunk['date_time'] + pd.Timedelta(hours=increase_hours_timezone)
            chunk['dayofweek'] = chunk['date_time'].dt.dayofweek
            chunk['week'] = chunk['date_time'].dt.week
            chunk['year_week'] = chunk['date_time'].dt.year.astype('str') + '/' + chunk['week'].astype('str')
            # Subtract one hour in winter weeks; rows outside those weeks become NaT
            # here and get their original timestamp back via fillna below.
            chunk['updated_date_time'] = chunk[chunk['year_week'].isin(unique_weeks)]['date_time'] - pd.Timedelta(hours=1)
            chunk['updated_date_time'].fillna(chunk['date_time'], inplace=True)
            # Drop weekend rows (dayofweek 5 = Saturday, 6 = Sunday):
            chunk = chunk[chunk['dayofweek'] < 5]
            # Drop the New Year and Christmas windows built by date_exclude_creator():
            chunk = filter_dates(chunk)
            chunk['date'] = chunk['updated_date_time'].dt.strftime('%Y%m%d')
            chunk['time'] = chunk['updated_date_time'].dt.strftime('%H:%M:%S')
            chunk[cols_names].to_csv(out_file, sep=',', header=False, line_terminator='\n', index=False)

def edit_history():
    start_time = time.perf_counter()
    read_edit_write(True)
    print(f'Bars of {symbol} are done! Spent: {(time.perf_counter() - start_time) / 60:0.2f} minutes')
    start_time = time.perf_counter()
    read_edit_write(False)
    print(f'Ticks of {symbol} are done! Spent: {(time.perf_counter() - start_time) / 60 / 60:0.2f} hours')

print(f'Editing bars and ticks of {symbol} has started.')
(all_new_years_start, all_new_years_end, all_christmases_start, all_christmases_end) = date_exclude_creator()
unique_weeks = get_winter_weeks()
edit_history()
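
# Optional sanity check (illustrative addition, not part of the original flow;
# guarded by the existing debug flag): preview the first rows of the edited bars
# file to confirm the timestamps were shifted and weekend rows were removed.
if is_print_technical_info:
    print(pd.read_csv(path_to_bars_edited_file, header=None, names=bars_cols_names, nrows=5))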
@n-lavrenko (Author):
This script uses the pandas library and can read huge files; for example, I read one file with a size of 11 GB.
P.S. I also cut some data on 01.01 and 25-26 December of each year (see the function date_exclude_creator()).
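
For anyone adapting this: the pattern that makes huge files tractable is pandas' chunked reading. Passing chunksize to read_csv returns an iterator of DataFrames, so only one slice of the file is in memory at a time. A minimal sketch of the pattern, with placeholder file and column names:

    import pandas as pd

    reader = pd.read_csv('huge_input.csv', header=None, names=['date', 'time', 'bid', 'ask'],
                         chunksize=1_000_000)  # iterator of ~1M-row DataFrames
    with open('filtered_output.csv', 'w') as out_file:
        for chunk in reader:  # one chunk held in memory at a time
            chunk = chunk[chunk['bid'].notna()]  # any per-chunk transformation
            chunk.to_csv(out_file, header=False, index=False)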
