from io import BytesIO
import os
import re
from zipfile import ZipFile

import baker
import ftfy
import requests
import xmltodict

_TEST_RESOURCES_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    '../src/test/resources/org/xbib/elasticsearch/index/mapper/langdetect/')

# Supported languages according to https://github.com/shuyo/language-detection/blob/wiki/LanguageList.md
_SUPPORTED_LANGUAGES = {
    'af': 'Afrikaans',
    'ar': 'Arabic',
    'bg': 'Bulgarian',
    'bn': 'Bengali',
    'ca': 'Catalan',  # Short profile only
    'cs': 'Czech',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'es': 'Spanish',
    'et': 'Estonian',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fr': 'French',
    'gu': 'Gujarati',
    'he': 'Hebrew',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'id': 'Indonesian',
    'it': 'Italian',
    'ja': 'Japanese',
    'kn': 'Kannada',
    'ko': 'Korean',
    'lt': 'Lithuanian',
    'lv': 'Latvian',
    'mk': 'Macedonian',
    'ml': 'Malayalam',
    'mr': 'Marathi',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'no': 'Norwegian',
    'pa': 'Punjabi',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'si': 'Sinhalese',  # Short profile only
    'sk': 'Slovak',
    'sl': 'Slovene',
    'so': 'Somali',
    'sq': 'Albanian',
    'sv': 'Swedish',
    'sw': 'Swahili',
    'ta': 'Tamil',
    'te': 'Telugu',
    'th': 'Thai',
    'tl': 'Tagalog',
    'tr': 'Turkish',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'vi': 'Vietnamese',
    'zh-cn': 'SimplifiedChinese',
    'zh-tw': 'TraditionalChinese'
}


@baker.command
def generate_udhr_dataset(out_path=os.path.join(_TEST_RESOURCES_PATH, 'udhr.tsv')):
    """
    Download and generate the Universal Declaration of Human Rights (UDHR) dataset.

    The generated dataset consists of translations of UDHR for all the languages supported by the plugin. This command
    applies minimal preprocessing to create the dataset, including: matching the downloaded dataset's language codes
    with those used by the plugin, removing each file's English intro, and dropping redundant whitespace.

    :param out_path: output path for the dataset file, which will be written in tab-separated format with two
                     columns: language code and text.
    """
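    # Each output row has the form '<language code>\t<cleaned translation text>'. Illustrative example only
    # (the exact text depends on the downloaded UDHR release):
    #   de\tAlle Menschen sind frei und gleich an Würde und Rechten geboren. ...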
    # Download and extract the translations
    input_zip = ZipFile(BytesIO(requests.get('http://unicode.org/udhr/assemblies/udhr_txt.zip').content))
    filename_to_content = {name: input_zip.read(name).decode('utf-8') for name in input_zip.namelist()}

    # Map the supported language codes to the translations. Generally, supported language codes (which come from old
    # Wikipedia dumps) match the UDHR BCP47 codes, but there are some exceptions.
    bcp47_code_to_supported_code = {
        # Monotonic Greek (apparently more modern with fewer diacritics than the polytonic system)
        'el-monoton': 'el',
        # German (the other option is de-1901, which seems too old)
        'de-1996': 'de',
        # Bokmål Norwegian (spoken by 85% of Norwegians -- the original Norwegian Wikipedia language)
        # See https://en.wikipedia.org/wiki/Norwegian_Wikipedia
        'nb': 'no',
        # There's only one Portuguese Wikipedia, so go with Portugal's Portuguese translation
        # See https://en.wikipedia.org/wiki/Portuguese_Wikipedia
        'pt-PT': 'pt',
        # Simplified and Traditional Chinese
        # The supported codes are a relic from old Chinese Wikipedia. Nowadays localisation is done on the fly.
        # See https://en.wikipedia.org/wiki/Chinese_Wikipedia
        'zh-Hans': 'zh-cn',
        'zh-Hant': 'zh-tw'
    }
    supported_code_to_filename = {}
    for file_info in xmltodict.parse(filename_to_content['index.xml'])['udhrs']['udhr']:
        supported_code = bcp47_code_to_supported_code.get(file_info['@bcp47'], file_info['@bcp47'])
        if supported_code in _SUPPORTED_LANGUAGES:
            # Some languages have multiple translations, so we just use the last one (which seems to be more recent)
            supported_code_to_filename[supported_code] = 'udhr_{}.txt'.format(file_info['@f'])
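    # Sanity check: every supported language must have exactly one matching UDHR translation file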
    assert len(_SUPPORTED_LANGUAGES) == len(supported_code_to_filename)

    # Write the selected translations to the output file
    whitespace_pattern = re.compile(r'\s+')
    with open(out_path, 'w', encoding='utf-8') as out_file:
        for supported_code, filename in sorted(supported_code_to_filename.items()):
            # Remove the first 6 lines (English header) and clean up whitespace
            clean_text = whitespace_pattern.sub(' ', ' '.join(filename_to_content[filename].split('\n')[6:])).strip()
            out_file.write('{}\t{}\n'.format(supported_code, clean_text))


@baker.command
def generate_wordpress_translations_dataset(out_path=os.path.join(_TEST_RESOURCES_PATH, 'wordpress-translations.tsv'),
                                            texts_per_language=50):
    """
    Download and generate the WordPress interface translations dataset.

    The generated dataset consists of translations for WordPress 4.6.x versions. This command applies minimal
    processing to create the dataset, including: matching the dataset's language codes with those used by the plugin,
    unescaping HTML entities, and stripping variable placeholders, HTML tags, and redundant whitespace.

    :param out_path: output path for the dataset file, which will be written in tab-separated format with two
                     columns: language code and text.
    :param texts_per_language: number of texts to retain per language. The output file will contain up to this
                               number of texts per language, excluding URL translations and word lists. The longest
                               texts for each language are retained.
    """
    url_template = 'https://translate.wordpress.org/projects/wp/4.6.x/{}/default/export-translations?format=json'
    requests_session = requests.Session()
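    # WordPress texts embed variable placeholders -- printf-style ones such as '%s' and '%1$s', and
    # ###UPPER_CASE### tokens such as '###SITENAME###' -- which are stripped before writing the output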
    wp_placeholder_pattern = re.compile(r'(%\d*\$?[sd])|(###[A-Z_]+###)')
    html_tag_pattern = re.compile(r'<[^>]+>')
    whitespace_pattern = re.compile(r'\s+')
    with open(out_path, 'w', encoding='utf-8') as out_file:
        for supported_code in sorted(_SUPPORTED_LANGUAGES):
            # Use Australian and Bokmål Norwegian as the representative English and Norwegian variants, respectively
            if supported_code == 'en':
                wp_code = 'en-au'
            elif supported_code == 'no':
                wp_code = 'nb'
            else:
                wp_code = supported_code
            # Clean and retain the longest texts
            clean_texts_with_len = []
            for original_text, translations in requests_session.get(url_template.format(wp_code)).json().items():
                # Skip links and simple lists (e.g., stopwords aren't translated to Chinese)
                if original_text.startswith('http') or original_text.startswith('Comma-separated'):
                    continue
                for translation in translations:
                    # Skip texts that haven't been translated
                    if supported_code != 'en' and original_text == translation:
                        continue
                    clean_text = wp_placeholder_pattern.sub('', translation)
                    clean_text = ftfy.fixes.unescape_html(clean_text)
                    clean_text = html_tag_pattern.sub('', clean_text)
                    clean_text = whitespace_pattern.sub(' ', clean_text).strip()
                    clean_texts_with_len.append((len(clean_text), clean_text))
            clean_texts_with_len.sort(reverse=True)
            for _, clean_text in clean_texts_with_len[:texts_per_language]:
                out_file.write('{}\t{}\n'.format(supported_code, clean_text))

if __name__ == '__main__':
    baker.run()
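
# Usage sketch (assumes this file is saved as create_datasets.py -- the actual filename isn't shown here;
# baker exposes each @baker.command function as a CLI sub-command, with keyword arguments as options):
#   python create_datasets.py generate_udhr_dataset
#   python create_datasets.py generate_wordpress_translations_dataset --texts_per_language 50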