Skip to content

Commit 02b8880

Browse files
committed
add clean_text option
1 parent c76aa41 commit 02b8880

File tree

7 files changed

+2
-1
lines changed

7 files changed

+2
-1
lines changed

.github/FUNDING.yml

100644 → 100755
File mode changed.

LICENSE.txt

100644 → 100755
File mode changed.

README.md

100644 → 100755
File mode changed.

bpeja.png

100644 → 100755
File mode changed.

emoji.json

100644 → 100755
File mode changed.

encode_bpe.py

100644 → 100755
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def decode(self, tokens, breakline='\n'):
100100
parser.add_argument("--dst_file", help="destnation file", required=True )
101101
parser.add_argument("--num_process", help="process num", type=int, default=8 )
102102
parser.add_argument("--combine", help="Concatenate files with <|endoftext|> separator into chunks of this minimum size", type=int, default=50000 )
103+
parser.add_argument('--clean_text', action='store_true')
103104
args = parser.parse_args()
104105

105106
with open('ja-bpe.txt', encoding='utf-8') as f:
@@ -123,7 +124,7 @@ def _proc(i):
123124
raw_text += fp.read()
124125
raw_text += '<|endoftext|>'
125126
if len(raw_text) >= args.combine:
126-
tokens = np.stack(enc.encode(raw_text))
127+
tokens = np.stack(enc.encode(raw_text, clean=args.clean_text))
127128
token_chunks.append(tokens)
128129
raw_text = ''
129130
if raw_text and len(raw_text) > 0:

ja-bpe.txt

100644 → 100755
File mode changed.

0 commit comments

Comments
 (0)