Skip to content

Commit 70a99a9

Browse files
authored
Fix preprocess filenames + cleaning. Bump version 0.4.1 (OpenNMT#983)
* Fix preprocess filenames and clean up intermediary files. Bump version to 0.4.1
1 parent 6de42cd commit 70a99a9

File tree

4 files changed

+13
-6
lines changed

4 files changed

+13
-6
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
### Fixes and improvements
1010

11+
## [0.4.1](https://github.com/OpenNMT/OpenNMT-py/tree/v0.4.1) (2018-10-11)
12+
* Fixed preprocessing file names; intermediary shard files are now cleaned up after use.
13+
1114
## [0.4.0](https://github.com/OpenNMT/OpenNMT-py/tree/v0.4.0) (2018-10-08)
1215
* Fixed Speech2Text training (thanks Yuntian)
1316

onmt/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@
1717
__all__ = [onmt.inputters, onmt.encoders, onmt.decoders, onmt.models,
1818
onmt.utils, onmt.modules, "Trainer"]
1919

20-
__version__ = "0.2.0"
20+
__version__ = "0.4.1"

preprocess.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import glob
99
import sys
1010
import gc
11+
import os
1112
import codecs
1213
import torch
1314
from onmt.utils.logging import init_logger, logger
@@ -59,14 +60,14 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
5960

6061
with codecs.open(src_corpus, "r", encoding="utf-8") as fsrc:
6162
with codecs.open(tgt_corpus, "r", encoding="utf-8") as ftgt:
63+
logger.info("Reading source and target files: %s %s."
64+
% (src_corpus, tgt_corpus))
6265
src_data = fsrc.readlines()
6366
tgt_data = ftgt.readlines()
6467

65-
src_corpus = "".join(src_corpus.split(".")[:-1])
66-
tgt_corpus = "".join(tgt_corpus.split(".")[:-1])
67-
6868
num_shards = int(len(src_data) / opt.shard_size)
6969
for x in range(num_shards):
70+
logger.info("Splitting shard %d." % x)
7071
f = codecs.open(src_corpus + ".{0}.txt".format(x), "w",
7172
encoding="utf-8")
7273
f.writelines(
@@ -79,6 +80,7 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
7980
f.close()
8081
num_written = num_shards * opt.shard_size
8182
if len(src_data) > num_written:
83+
logger.info("Splitting shard %d." % num_shards)
8284
f = codecs.open(src_corpus + ".{0}.txt".format(num_shards),
8385
'w', encoding="utf-8")
8486
f.writelines(
@@ -96,6 +98,7 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
9698
ret_list = []
9799

98100
for index, src in enumerate(src_list):
101+
logger.info("Building shard %d." % index)
99102
dataset = inputters.build_dataset(
100103
fields, opt.data_type,
101104
src_path=src,
@@ -124,7 +127,8 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
124127
torch.save(dataset, pt_file)
125128

126129
ret_list.append(pt_file)
127-
130+
os.remove(src)
131+
os.remove(tgt_list[index])
128132
del dataset.examples
129133
gc.collect()
130134
del dataset

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
setup(name='OpenNMT-py',
66
description='A python implementation of OpenNMT',
7-
version='0.2.1',
7+
version='0.4.1',
88

99
packages=['onmt', 'onmt.encoders', 'onmt.modules', 'onmt.tests',
1010
'onmt.translate', 'onmt.decoders', 'onmt.inputters',

0 commit comments

Comments
 (0)