mirror of https://github.com/usememos/memos
				
				
				
			feat: update Chinese translations in zh.json and zh-Hant.json using locale_updater.py (#1506)
update zh.json and zh-Hant.json using locale_updater.pypull/1507/head
							parent
							
								
									66ed43cbcb
								
							
						
					
					
						commit
						0020498c10
					
				@ -0,0 +1,202 @@
 | 
			
		||||
# Author: Oaklight
 | 
			
		||||
# GitHub profile: https://github.com/Oaklight
 | 
			
		||||
# Date: April 9, 2023
 | 
			
		||||
# Description: This script is used to patch missing translations in a locale file.
 | 
			
		||||
 | 
			
		||||
# The script uses 'en.json' as the reference file to find missing keys in other locale files.
 | 
			
		||||
# It iterates through each field and their entries in 'en.json' and checks if the same field/entry exists in other files.
 | 
			
		||||
# If a field/entry is missing, the script prompts the source string, reference Google translation, and asks for confirmation or correction.
 | 
			
		||||
# The resulting file is saved as './*.proposed.json', and you should review it before merging and uploading.
 | 
			
		||||
 | 
			
		||||
# usage: locale_updater.py [-h] ref_locale tgt_locale
 | 
			
		||||
 | 
			
		||||
#TODO: add other NMT system for different preference and accuracy
 | 
			
		||||
 | 
			
		||||
import json
 | 
			
		||||
import requests
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def flatten_json(nested_json, parent_key="", sep=":"):
 | 
			
		||||
    flattened_dict = {}
 | 
			
		||||
    for key, value in nested_json.items():
 | 
			
		||||
        new_key = parent_key + sep + key if parent_key else key
 | 
			
		||||
        if isinstance(value, dict):
 | 
			
		||||
            flattened_dict.update(flatten_json(value, new_key, sep))
 | 
			
		||||
        else:
 | 
			
		||||
            flattened_dict[new_key] = value
 | 
			
		||||
    return flattened_dict
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def unflatten_json(flattened_dict, sep=":"):
 | 
			
		||||
    nested_json = {}
 | 
			
		||||
    for key, value in flattened_dict.items():
 | 
			
		||||
        parts = key.split(sep)
 | 
			
		||||
        current = nested_json
 | 
			
		||||
        for part in parts[:-1]:
 | 
			
		||||
            if part not in current:
 | 
			
		||||
                current[part] = {}
 | 
			
		||||
            current = current[part]
 | 
			
		||||
        current[parts[-1]] = value
 | 
			
		||||
    return nested_json
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def sort_nested_json(nested_json):
 | 
			
		||||
    if isinstance(nested_json, dict):
 | 
			
		||||
        sorted_dict = {}
 | 
			
		||||
        for key in sorted(nested_json.keys()):
 | 
			
		||||
            sorted_dict[key] = sort_nested_json(nested_json[key])
 | 
			
		||||
        return sorted_dict
 | 
			
		||||
    elif isinstance(nested_json, list):
 | 
			
		||||
        sorted_list = []
 | 
			
		||||
        for item in nested_json:
 | 
			
		||||
            sorted_list.append(sort_nested_json(item))
 | 
			
		||||
        return sorted_list
 | 
			
		||||
    else:
 | 
			
		||||
        return nested_json
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def google_translate(
 | 
			
		||||
    source_text, source_language="en", target_language="zh-CN"
 | 
			
		||||
):
 | 
			
		||||
    # Create post content
 | 
			
		||||
    new_line = "\r\n"
 | 
			
		||||
    post_content = "q=" + source_text.replace(new_line, " ")
 | 
			
		||||
 | 
			
		||||
    # Send post request and get JSON response, using source_language and target_language
 | 
			
		||||
    # url = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=zh-CN&dt=t"
 | 
			
		||||
    url = f"https://translate.googleapis.com/translate_a/single?client=gtx&sl={source_language}&tl={target_language}&dt=t"
 | 
			
		||||
    headers = {"Content-type": "application/x-www-form-urlencoded"}
 | 
			
		||||
    response = requests.post(url, headers=headers, data=post_content.encode("utf-8"))
 | 
			
		||||
    json_value = response.json()
 | 
			
		||||
 | 
			
		||||
    # Extract translations from JSON
 | 
			
		||||
    translations = [item[0] for item in json_value[0]]
 | 
			
		||||
    translations = [t.replace(new_line, "") for t in translations]
 | 
			
		||||
    target_text = translations[0]
 | 
			
		||||
 | 
			
		||||
    return target_text
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_code_name(json_filename):
 | 
			
		||||
    # Remove extension and split language and country codes
 | 
			
		||||
    file_parts = json_filename.split(".")[0].split("_")
 | 
			
		||||
    lang_code = file_parts[0]
 | 
			
		||||
    country_code = file_parts[1] if len(file_parts) > 1 else ""
 | 
			
		||||
 | 
			
		||||
    # Map language code to code name
 | 
			
		||||
    lang_map = {
 | 
			
		||||
        "de": "de",
 | 
			
		||||
        "en": "en",
 | 
			
		||||
        "es": "es",
 | 
			
		||||
        "fr": "fr",
 | 
			
		||||
        "it": "it",
 | 
			
		||||
        "ko": "ko",
 | 
			
		||||
        "nl": "nl",
 | 
			
		||||
        "pl": "pl",
 | 
			
		||||
        "pt": "pt-BR",
 | 
			
		||||
        "ru": "ru",
 | 
			
		||||
        "sl": "sl",
 | 
			
		||||
        "sv": "sv",
 | 
			
		||||
        "tr": "tr",
 | 
			
		||||
        "uk": "uk",
 | 
			
		||||
        "vi": "vi",
 | 
			
		||||
        "zh-Hant": "zh-TW",
 | 
			
		||||
        "zh": "zh-CN",
 | 
			
		||||
    }
 | 
			
		||||
    code_name = lang_map.get(lang_code, "")
 | 
			
		||||
 | 
			
		||||
    # Add country code if available
 | 
			
		||||
    if country_code:
 | 
			
		||||
        code_name += "-" + country_code.upper()
 | 
			
		||||
 | 
			
		||||
    return code_name
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    # ref_locale = "./en.json"
 | 
			
		||||
    # tgt_locale = "./zh.json"
 | 
			
		||||
    # receive the reference locale and target locale from the command line using argparse
 | 
			
		||||
    import argparse
 | 
			
		||||
 | 
			
		||||
    parser = argparse.ArgumentParser()
 | 
			
		||||
    parser.add_argument("ref_locale", help="reference locale file")
 | 
			
		||||
    parser.add_argument("tgt_locale", help="target locale file")
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
    ref_locale = args.ref_locale
 | 
			
		||||
    tgt_locale = args.tgt_locale
 | 
			
		||||
 | 
			
		||||
    ref_codename = get_code_name(ref_locale)
 | 
			
		||||
    tgt_codename = get_code_name(tgt_locale)
 | 
			
		||||
 | 
			
		||||
    with open(ref_locale, "r") as f:
 | 
			
		||||
        ref = json.load(f)
 | 
			
		||||
 | 
			
		||||
    with open(tgt_locale, "r") as f:
 | 
			
		||||
        tgt = json.load(f)
 | 
			
		||||
 | 
			
		||||
    # using the flatten_json function, produce a temp json for each locale and save to the disk
 | 
			
		||||
    ref_flat = flatten_json(ref)
 | 
			
		||||
    tgt_flat = flatten_json(tgt)
 | 
			
		||||
 | 
			
		||||
    # # save the flattened json to the disk
 | 
			
		||||
    # with open("ref_flat.json", "w") as f:
 | 
			
		||||
    #     json.dump(ref_flat, f, indent=2, ensure_ascii=False)
 | 
			
		||||
    # with open("tgt_flat.json", "w") as f:
 | 
			
		||||
    #     json.dump(tgt_flat, f, indent=2, ensure_ascii=False)
 | 
			
		||||
 | 
			
		||||
    # first diff the keys to inform the user of the missing keys
 | 
			
		||||
    missing_keys = set(ref_flat.keys()) - set(tgt_flat.keys())
 | 
			
		||||
    # print total number of missing keys, in red color, number as default
 | 
			
		||||
    if len(missing_keys) == 0:
 | 
			
		||||
        print("\033[92m All keys are present in the target locale \033[0m")
 | 
			
		||||
        exit()
 | 
			
		||||
    else:
 | 
			
		||||
        print(f"\033[91m Total missing keys: \033[0m {len(missing_keys)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # formatted print line by line, wrap the missing key in red color, and the English translation in green color
 | 
			
		||||
    for key in missing_keys:
 | 
			
		||||
        # print(f"Missing key: {key} | English: {ref_flat[key]}")
 | 
			
		||||
        print(
 | 
			
		||||
            "\033[91m"
 | 
			
		||||
            + f"Missing key: {key}"
 | 
			
		||||
            + "\033[0m"
 | 
			
		||||
            + " | "
 | 
			
		||||
            + "\033[92m"
 | 
			
		||||
            + f"English: {ref_flat[key]}"
 | 
			
		||||
            + "\033[0m"
 | 
			
		||||
        )
 | 
			
		||||
    print("=============================================")
 | 
			
		||||
    print(f"\033[91m Total missing keys: \033[0m {len(missing_keys)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # now compare the tgt_flat with ref_flat to find all missing keys and prompt to terminal for translation. Then save back to the tgt_flat
 | 
			
		||||
 | 
			
		||||
    # iterate over the missing key and their corresponding values in ref_flat, to get reference google translation using google_translate_to_chinese function
 | 
			
		||||
    # then present the reference translation to the user in the terminal
 | 
			
		||||
    # then present the user with a prompt to ask for translation
 | 
			
		||||
    for i, key in enumerate(missing_keys):
 | 
			
		||||
        print(
 | 
			
		||||
            f"============================================= {i + 1}/{len(missing_keys)}"
 | 
			
		||||
        )
 | 
			
		||||
        # print wrap the missing key in red color, and the English translation in green color
 | 
			
		||||
        print("\033[91m" + "Missing key: " + "\033[0m" + key)
 | 
			
		||||
        print("\033[92m" + f"{ref_codename}: " + "\033[0m" + ref_flat[key])
 | 
			
		||||
        # get reference translation from google translate, print in blue
 | 
			
		||||
        proposal_google = google_translate(ref_flat[key], ref_codename, tgt_codename)
 | 
			
		||||
        print("\033[94m" + f"Reference {tgt_codename} translation: " + "\033[0m" + proposal_google)
 | 
			
		||||
        # prompt user for translation, or enter to use the reference translation, in green color
 | 
			
		||||
        proposal = input("\033[92m" + "Enter translation: " + "\033[0m")
 | 
			
		||||
        if proposal == "":
 | 
			
		||||
            proposal = proposal_google
 | 
			
		||||
        # save the translation to the tgt_flat
 | 
			
		||||
        tgt_flat[key] = proposal
 | 
			
		||||
 | 
			
		||||
    # unflatten the ref_flat.json and tgt_flat.json back to the original format. save to another file
 | 
			
		||||
    ref_unflat = unflatten_json(ref_flat)
 | 
			
		||||
    tgt_unflat = unflatten_json(tgt_flat)
 | 
			
		||||
    # save the unflattened json to the disk, with original tgt file name with ".proposed" appended before .json
 | 
			
		||||
    # by getting the file name from from the tgt_locale path
 | 
			
		||||
    tgt_locale_name = tgt_locale.split("/")[-1].split(".")[0]
 | 
			
		||||
    with open(f"{tgt_locale_name}.proposed.json", "w") as f:
 | 
			
		||||
        json.dump(tgt_unflat, f, indent=2, ensure_ascii=False)
 | 
			
		||||
					Loading…
					
					
				
		Reference in New Issue