This repository was archived by the owner on Dec 18, 2021. It is now read-only.

Commit e852f61

Merge pull request #1 from Automattic/add-length-tests
Add tests to evaluate classification accuracy
2 parents cfc674d + 3a09156 commit e852f61

File tree

6 files changed: +3285 -9 lines changed


scripts/README.md

Lines changed: 13 additions & 0 deletions
# Dataset generation scripts

This directory contains the Python code that was used to generate the datasets for the language detection tests.

## Setup

* Create a [virtualenv](https://virtualenv.pypa.io/en/stable/) with Python 3.5+ as the interpreter
* Install Python requirements from `requirements.txt` (run `pip install -r requirements.txt`)

## Running the code

* Activate the virtualenv created in the setup phase
* Run `python run.py` to view available commands
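
For context, `run.py` (shown below) registers its commands with the Baker library, so running it with no arguments lists the registered commands. The following minimal sketch is not part of this commit and uses a hypothetical `example_command`; it only illustrates that pattern:

    # Hypothetical sketch of the Baker command pattern used by run.py (not part of this commit)
    import baker

    @baker.command
    def example_command(out_path='example.tsv'):
        """The function name becomes the command name; keyword arguments become command-line options."""
        print('Would write output to', out_path)

    if __name__ == '__main__':
        # `python example.py` lists the registered commands;
        # `python example.py example_command` runs the function above.
        baker.run()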

scripts/requirements.txt

Lines changed: 4 additions & 0 deletions
Baker==1.3
ftfy==4.2.0
requests==2.12.1
xmltodict==0.10.2

scripts/run.py

Lines changed: 176 additions & 0 deletions
from io import BytesIO
import os
import re
from zipfile import ZipFile

import baker
import ftfy
import requests
import xmltodict

_TEST_RESOURCES_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    '../src/test/resources/org/xbib/elasticsearch/index/mapper/langdetect/')

# Supported languages according to https://github.com/shuyo/language-detection/blob/wiki/LanguageList.md
_SUPPORTED_LANGUAGES = {
    'af': 'Afrikaans',
    'ar': 'Arabic',
    'bg': 'Bulgarian',
    'bn': 'Bengali',
    'ca': 'Catalan',  # Short profile only
    'cs': 'Czech',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'es': 'Spanish',
    'et': 'Estonian',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fr': 'French',
    'gu': 'Gujarati',
    'he': 'Hebrew',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'id': 'Indonesian',
    'it': 'Italian',
    'ja': 'Japanese',
    'kn': 'Kannada',
    'ko': 'Korean',
    'lt': 'Lithuanian',
    'lv': 'Latvian',
    'mk': 'Macedonian',
    'ml': 'Malayalam',
    'mr': 'Marathi',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'no': 'Norwegian',
    'pa': 'Punjabi',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'si': 'Sinhalese',  # Short profile only
    'sk': 'Slovak',
    'sl': 'Slovene',
    'so': 'Somali',
    'sq': 'Albanian',
    'sv': 'Swedish',
    'sw': 'Swahili',
    'ta': 'Tamil',
    'te': 'Telugu',
    'th': 'Thai',
    'tl': 'Tagalog',
    'tr': 'Turkish',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'vi': 'Vietnamese',
    'zh-cn': 'SimplifiedChinese',
    'zh-tw': 'TraditionalChinese'
}


@baker.command
def generate_udhr_dataset(out_path=os.path.join(_TEST_RESOURCES_PATH, 'udhr.tsv')):
    """
    Download and generate the Universal Declaration of Human Rights (UDHR) dataset.

    The generated dataset consists of translations of the UDHR for all the languages supported by the plugin. This
    command applies minimal preprocessing to create the dataset, including: matching the downloaded dataset's language
    codes with those used by the plugin, removing each file's English intro, and dropping redundant whitespace.

    :param out_path: output path for the dataset file, which will be written in tab-separated format with two
                     columns: language code and text.
    """
    # Download and extract the translations
    input_zip = ZipFile(BytesIO(requests.get('http://unicode.org/udhr/assemblies/udhr_txt.zip').content))
    filename_to_content = {name: input_zip.read(name).decode('utf-8') for name in input_zip.namelist()}

    # Map the supported language codes to the translations. Generally, supported language codes (which come from old
    # Wikipedia dumps) match the UDHR BCP47 codes, but there are some exceptions.
    bcp47_code_to_supported_code = {
        # Monotonic Greek (apparently more modern with fewer diacritics than the polytonic system)
        'el-monoton': 'el',
        # German (the other option is de-1901, which seems too old)
        'de-1996': 'de',
        # Bokmål Norwegian (spoken by 85% of Norwegians -- the original Norwegian Wikipedia language)
        # See https://en.wikipedia.org/wiki/Norwegian_Wikipedia
        'nb': 'no',
        # There's only one Portuguese Wikipedia, so go with Portugal's Portuguese translation
        # See https://en.wikipedia.org/wiki/Portuguese_Wikipedia
        'pt-PT': 'pt',
        # Simplified and Traditional Chinese
        # The supported codes are a relic from old Chinese Wikipedia. Nowadays localisation is done on the fly.
        # See https://en.wikipedia.org/wiki/Chinese_Wikipedia
        'zh-Hans': 'zh-cn',
        'zh-Hant': 'zh-tw'
    }
    supported_code_to_filename = {}
    for file_info in xmltodict.parse(filename_to_content['index.xml'])['udhrs']['udhr']:
        supported_code = bcp47_code_to_supported_code.get(file_info['@bcp47'], file_info['@bcp47'])
        if supported_code in _SUPPORTED_LANGUAGES:
            # Some languages have multiple translations, so we just use the last one (which seems to be more recent)
            supported_code_to_filename[supported_code] = 'udhr_{}.txt'.format(file_info['@f'])
    assert len(_SUPPORTED_LANGUAGES) == len(supported_code_to_filename)

    # Write the selected translations to the output file
    whitespace_pattern = re.compile(r'\s+')
    with open(out_path, 'w', encoding='utf-8') as out_file:
        for supported_code, filename in sorted(supported_code_to_filename.items()):
            # Remove the first 6 lines (English header) and clean up whitespace
            clean_text = whitespace_pattern.sub(' ', ' '.join(filename_to_content[filename].split('\n')[6:])).strip()
            out_file.write('{}\t{}\n'.format(supported_code, clean_text))


@baker.command
def generate_wordpress_translations_dataset(out_path=os.path.join(_TEST_RESOURCES_PATH, 'wordpress-translations.tsv'),
                                            texts_per_language=50):
    """
    Download and generate the WordPress interface translations dataset.

    The generated dataset consists of translations for WordPress 4.6.x versions. This command applies minimal
    processing to create the dataset, including: matching the dataset's language codes with those used by the plugin,
    unescaping HTML entities, and stripping variable placeholders, HTML tags, and redundant whitespace.

    :param out_path: output path for the dataset file, which will be written in tab-separated format with two
                     columns: language code and text.
    :param texts_per_language: number of texts to retain per language. The output file will contain up to this number
                               of texts per language, excluding URL translations and word lists. The longest texts for
                               each language are retained.
    """
    url_template = 'https://translate.wordpress.org/projects/wp/4.6.x/{}/default/export-translations?format=json'
    requests_session = requests.Session()
    wp_placeholder_pattern = re.compile(r'(%\d*\$?[sd])|(###[A-Z_]+###)')
    html_tag_pattern = re.compile(r'<[^>]+>')
    whitespace_pattern = re.compile(r'\s+')
    with open(out_path, 'w', encoding='utf-8') as out_file:
        for supported_code in sorted(_SUPPORTED_LANGUAGES):
            # Use Australian and Bokmål Norwegian as the representative English and Norwegian variants, respectively
            if supported_code == 'en':
                wp_code = 'en-au'
            elif supported_code == 'no':
                wp_code = 'nb'
            else:
                wp_code = supported_code
            # Clean and retain the longest texts
            clean_texts_with_len = []
            for original_text, translations in requests_session.get(url_template.format(wp_code)).json().items():
                # Skip links and simple lists (e.g., stopwords aren't translated to Chinese)
                if original_text.startswith('http') or original_text.startswith('Comma-separated'):
                    continue
                for translation in translations:
                    # Skip texts that haven't been translated
                    if supported_code != 'en' and original_text == translation:
                        continue
                    clean_text = wp_placeholder_pattern.sub('', translation)
                    clean_text = ftfy.fixes.unescape_html(clean_text)
                    clean_text = html_tag_pattern.sub('', clean_text)
                    clean_text = whitespace_pattern.sub(' ', clean_text).strip()
                    clean_texts_with_len.append((len(clean_text), clean_text))
            clean_texts_with_len.sort(reverse=True)
            for _, clean_text in clean_texts_with_len[:texts_per_language]:
                out_file.write('{}\t{}\n'.format(supported_code, clean_text))


if __name__ == '__main__':
    baker.run()
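
Both commands write a two-column TSV (language code, text), as described in their docstrings. As a rough illustration only, not part of this commit, a generated file could be spot-checked with a few lines of Python (the path assumes the default `out_path` of `generate_udhr_dataset`):

    # Hypothetical spot-check of a generated dataset (assumes udhr.tsv was generated at its default path)
    udhr_path = '../src/test/resources/org/xbib/elasticsearch/index/mapper/langdetect/udhr.tsv'
    with open(udhr_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            language_code, text = line.rstrip('\n').split('\t', 1)
            print(language_code, text[:60])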
