 import glob
 import sys
 import gc
+import os
 import codecs
 import torch
 from onmt.utils.logging import init_logger, logger
@@ -59,14 +60,14 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,

     with codecs.open(src_corpus, "r", encoding="utf-8") as fsrc:
         with codecs.open(tgt_corpus, "r", encoding="utf-8") as ftgt:
+            logger.info("Reading source and target files: %s %s."
+                        % (src_corpus, tgt_corpus))
             src_data = fsrc.readlines()
             tgt_data = ftgt.readlines()

-    src_corpus = "".join(src_corpus.split(".")[:-1])
-    tgt_corpus = "".join(tgt_corpus.split(".")[:-1])
-
     num_shards = int(len(src_data) / opt.shard_size)
     for x in range(num_shards):
+        logger.info("Splitting shard %d." % x)
         f = codecs.open(src_corpus + ".{0}.txt".format(x), "w",
                         encoding="utf-8")
         f.writelines(
@@ -79,6 +80,7 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
         f.close()
     num_written = num_shards * opt.shard_size
     if len(src_data) > num_written:
+        logger.info("Splitting shard %d." % num_shards)
         f = codecs.open(src_corpus + ".{0}.txt".format(num_shards),
                         'w', encoding="utf-8")
         f.writelines(
@@ -96,6 +98,7 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
     ret_list = []

     for index, src in enumerate(src_list):
+        logger.info("Building shard %d." % index)
         dataset = inputters.build_dataset(
             fields, opt.data_type,
             src_path=src,
@@ -124,7 +127,8 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
         torch.save(dataset, pt_file)

         ret_list.append(pt_file)
-
+        os.remove(src)
+        os.remove(tgt_list[index])
         del dataset.examples
         gc.collect()
         del dataset
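For context on what the new "Splitting shard %d." messages correspond to, here is a minimal standalone sketch of the sharding arithmetic the function uses (it assumes only opt.shard_size semantics from the diff; the corpus data and names below are illustrative, not from the PR):

# Sketch of the shard-splitting arithmetic shown in the diff above.
# A corpus of N lines yields int(N / shard_size) full shards, plus one
# smaller trailing shard if any lines remain.
src_data = ["line %d\n" % i for i in range(2500)]  # pretend corpus
shard_size = 1000                                   # stand-in for opt.shard_size

num_shards = int(len(src_data) / shard_size)        # 2 full shards here
for x in range(num_shards):
    shard = src_data[x * shard_size:(x + 1) * shard_size]
    print("Splitting shard %d: %d lines" % (x, len(shard)))

num_written = num_shards * shard_size
if len(src_data) > num_written:                     # 500 leftover lines
    print("Splitting shard %d: %d lines"
          % (num_shards, len(src_data) - num_written))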