Skip to content

Commit ed3b817

Browse files
author
Philippe Rémy
authored
Merge pull request philipperemy#1 from philipperemy/quantiles
Quantiles
2 parents 6495a51 + 9f7d3f5 commit ed3b817

File tree

4 files changed

+87
-39
lines changed

4 files changed

+87
-39
lines changed

data_generator.py

Lines changed: 61 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,54 +5,89 @@
55
from uuid import uuid4
66

77
import numpy as np
8+
import pandas as pd
89

910
from data_manager import file_processor
11+
from returns_quantization import add_returns_in_place
1012
from utils import *
1113

14+
# Debug-friendly display settings: print arrays and frames in full instead of
# truncated summaries.
# NumPy rejects a NaN threshold; sys.maxsize is the documented way to disable
# summarization entirely.
np.set_printoptions(threshold=sys.maxsize)
# The former 'display.height' option is gone from pandas (setting it raises an
# OptionError); 'display.max_rows' is the option that controls row truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
19+
20+
21+
def generate_quantiles(data_folder, bitcoin_file):
    """Build the CNN image dataset using return-quantile labels as classes.

    The class of a sample is the string form of the
    'close_price_returns_labels' value stored on the row that immediately
    follows the sampled slice.

    :param data_folder: folder forwarded to ``generate_cnn_dataset``.
    :param bitcoin_file: market data file forwarded to ``generate_cnn_dataset``.
    :return: whatever ``generate_cnn_dataset`` returns.
    """

    def get_label(btc_df, btc_slice, i, slice_size):
        # The slice covers rows [i, i + slice_size); the label is read from
        # the single row right after it.
        following_row = btc_df[i + slice_size:i + slice_size + 1]
        return str(following_row['close_price_returns_labels'].values[0])

    return generate_cnn_dataset(data_folder, bitcoin_file, get_label)
27+
28+
29+
def generate_up_down(data_folder, bitcoin_file):
    """Build the CNN image dataset using binary 'UP' / 'DOWN' classes.

    A sample is 'UP' when the close price of the row following the slice is
    strictly greater than the reference close price taken from the slice,
    and 'DOWN' otherwise (including equality).

    :param data_folder: folder forwarded to ``generate_cnn_dataset``.
    :param bitcoin_file: market data file forwarded to ``generate_cnn_dataset``.
    :return: whatever ``generate_cnn_dataset`` returns.
    """

    def get_price_direction(btc_df, btc_slice, i, slice_size):
        # NOTE(review): [-2:-1] picks the second-to-last row of the slice,
        # not the last one -- confirm this is the intended reference price.
        reference_close = btc_slice[-2:-1]['price_close'].values[0]
        following_close = btc_df[i + slice_size:i + slice_size + 1]['price_close'].values[0]
        return 'UP' if reference_close < following_close else 'DOWN'

    return generate_cnn_dataset(data_folder, bitcoin_file, get_price_direction)
40+
41+
42+
def generate_cnn_dataset(data_folder, bitcoin_file, get_class_name,
                         slice_size=40, test_every_steps=10, num_samples=int(1e6)):
    """Sample random price windows and save each one as a labelled PNG.

    The market data is loaded with ``file_processor`` and augmented with
    return columns by ``add_returns_in_place``. Windows of ``slice_size`` rows
    are then drawn at random and written under
    ``<data_folder>/<train|test>/<class_name>/<uuid>.png``.

    :param data_folder: output root; it is deleted before generation starts.
    :param bitcoin_file: path of the market data file given to ``file_processor``.
    :param get_class_name: callable ``(btc_df, btc_slice, i, slice_size)``
        returning the class name (a string) of the sampled window.
    :param slice_size: number of rows per sampled window.
    :param test_every_steps: every ``test_every_steps``-th sample goes to the
        'test' split, the others to 'train'.
    :param num_samples: number of windows to generate.
    :raises ValueError: if the data has too few rows for one window plus its
        following row, or if a sampled window contains NaN values.
    """
    btc_df = file_processor(bitcoin_file)
    btc_df, levels = add_returns_in_place(btc_df)

    # Sanity check: report the share of rows falling in each label.
    print('-' * 80)
    print('Those values should be roughly equal to 1/len(levels):')
    labels = btc_df['close_price_returns_labels']
    for ii in range(len(levels)):
        print(ii, np.mean((labels == ii).values))
    print(levels)
    print('-' * 80)

    # Valid start offsets leave room for the window and the row after it.
    n = len(btc_df) - slice_size
    if n <= 0:
        # Fail before the output folder is wiped rather than inside the loop.
        raise ValueError('Not enough rows ({}) for a slice of size {}.'.format(len(btc_df), slice_size))

    shutil.rmtree(data_folder, ignore_errors=True)
    for epoch in range(num_samples):
        st = time()

        i = np.random.choice(n)
        btc_slice = btc_df[i:i + slice_size]

        if btc_slice.isnull().values.any():
            # sometimes prices are discontinuous and nothing happened in one 5min bucket.
            # in that case, we consider this slice as wrong and we raise an exception.
            # it's likely to happen at the beginning of the data set where the volumes are low.
            raise ValueError('NaN values detected. Please remove them.')

        class_name = get_class_name(btc_df, btc_slice, i, slice_size)
        subset = 'test' if epoch % test_every_steps == 0 else 'train'
        save_dir = os.path.join(data_folder, subset, class_name)
        mkdir_p(save_dir)
        filename = save_dir + '/' + str(uuid4()) + '.png'
        save_to_file(btc_slice, filename=filename)
        print('epoch = {0}, time = {1:.3f}, filename = {2}'.format(str(epoch).zfill(8), time() - st, filename))
4778

4879

4980
def main():
    """Command-line entry point for dataset generation.

    Expects three arguments: the output data folder, the market data CSV path
    and an integer USE_QUANTILES flag (non-zero selects quantile labels, zero
    selects UP/DOWN labels).

    :raises SystemExit: with a usage message when the arguments are invalid.
    """
    args = sys.argv
    # Explicit check instead of `assert`: assertions are stripped under -O.
    if len(args) != 4:
        raise SystemExit('Usage: python3 {} DATA_FOLDER_TO_STORE_GENERATED_DATASET '
                         'BITCOIN_MARKET_DATA_CSV_PATH USE_QUANTILES'.format(args[0]))
    data_folder = args[1]
    bitcoin_file = args[2]
    try:
        use_quantiles = int(args[3])
    except ValueError:
        raise SystemExit('USE_QUANTILES must be an integer (e.g. 0 or 1), got {!r}'.format(args[3]))

    data_gen_func = generate_quantiles if use_quantiles else generate_up_down
    print('Using: {}'.format(data_gen_func))
    data_gen_func(data_folder, bitcoin_file)
5691

5792

5893
if __name__ == '__main__':

data_manager.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,11 @@ def file_processor(data_file):
1515
v = pd.DataFrame(d['volume'].resample('5Min').sum())
1616
v.columns = ['volume']
1717
p['volume'] = v['volume']
18+
19+
# drop NaN values.
20+
# For example, sometimes we have no data for a whole hour in a row.
21+
# So we have NaN buckets of 5Min in this particular hour.
22+
# Our convention is to avoid those NaN values and drop them!
23+
p = p.dropna()
1824
p.to_csv('/tmp/bitcoin_coinbase_M5.csv', sep='\t')
1925
return p

returns_quantization.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,30 @@
66
from utils import compute_returns
77

88

9-
def generate(data_folder, bitcoin_file):
10-
p = file_processor(bitcoin_file)
11-
close_prices_returns = compute_returns(p)
9+
def add_returns_in_place(df, num_bins=10):
    """Add close-price return columns to ``df`` (the frame is modified).

    Three columns are written:
    'close_price_returns' (output of ``compute_returns``),
    'close_price_returns_bins' (quantile interval of each return) and
    'close_price_returns_labels' (integer index of that quantile).

    :param df: price DataFrame handed to ``compute_returns``.
    :param num_bins: number of quantile bins passed to ``pd.qcut``.
    :return: tuple ``(df, bins_categories)`` where ``bins_categories`` are the
        categories of the quantile binning.
    """
    close_prices_returns = compute_returns(df)
    # Quantile-based bins, as opposed to equal-width bins.
    returns_bins = pd.qcut(close_prices_returns, num_bins)
    bins_categories = returns_bins.values.categories
    returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)

    df['close_price_returns'] = close_prices_returns
    df['close_price_returns_bins'] = returns_bins
    df['close_price_returns_labels'] = returns_labels

    return df, bins_categories
21+
22+
23+
def generate_bins(bitcoin_file):
    """Load the market data, add the return columns and print the result.

    :param bitcoin_file: path of the market data file given to ``file_processor``.
    """
    prices = file_processor(bitcoin_file)
    augmented = add_returns_in_place(prices)
    print(augmented)
1726

1827

1928
def main():
    """Command-line entry point: print the return bins for a market data file.

    Expects exactly one argument, the market data CSV path.

    :raises SystemExit: with a usage message when the argument count is wrong.
    """
    arg = sys.argv
    # Explicit check instead of `assert`: assertions are stripped under -O.
    if len(arg) != 2:
        raise SystemExit('Usage: python3 {} BITCOIN_MARKET_DATA_CSV_PATH'.format(arg[0]))
    bitcoin_file = arg[1]
    generate_bins(bitcoin_file)
2633

2734

2835
if __name__ == '__main__':

utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
def compute_returns(p):
    """Return the percentage change of 'price_close' versus the previous row.

    The value at row ``t`` is ``100 * (close[t] - close[t-1]) / close[t-1]``.
    The first row, and any row whose computation yields NaN, is set to 0.

    :param p: DataFrame with a 'price_close' column.
    :return: Series of percentage returns aligned with ``p``'s index.
    """
    close_prices = p['price_close']
    previous_close = close_prices.shift(1)
    # Backward difference: row t only uses information available at time t.
    close_prices_returns = 100 * ((close_prices - previous_close) / previous_close)
    return close_prices_returns.fillna(0)
55

66

77
def plot_p(df):

0 commit comments

Comments
 (0)