Spaces:
Running
Running
| import subprocess | |
# Maps the language codes used by this module to the language names passed to
# Tesseract. Most keys are two-letter ISO 639-1 codes; entries that have no
# such code (script models, historic or vertical variants) reuse a longer
# identifier as the key. supported_languages() inverts this table to translate
# Tesseract's own language list back into these keys.
#
# NOTE(review): "kor-vert" maps to "kor_vert" (underscore) while the other
# variant entries ("jpn-vert", "chi-sim-vert", ...) map to hyphenated names.
# Confirm against the output of `tesseract --list-langs` which spelling the
# installed Tesseract actually reports.
iso_to_tesseract = {
    "af": "afr",  # Afrikaans
    "all": "all",  # Allar
    "am": "amh",  # Amharic
    "ar": "ara",  # Arabic
    "as": "asm",  # Assamese
    "az": "aze",  # Azerbaijani
    "aze-cyrl": "aze-cyrl",  # Azerbaijani (Cyrillic)
    "be": "bel",  # Belarusian
    "bn": "ben",  # Bangla
    "bo": "bod",  # Tibetan
    "bs": "bos",  # Bosnian
    "br": "bre",  # Breton
    "bg": "bul",  # Bulgarian
    "ca": "cat",  # Catalan
    "ceb": "ceb",  # Cebuano
    "cs": "ces",  # Czech
    "zh-Hans": "chi_sim",  # Chinese (Simplified)
    "chi-sim-vert": "chi-sim-vert",  # Chinese (Simplified) vertical
    "zh-Hant": "chi_tra",  # Chinese (Traditional)
    "chi-tra-vert": "chi-tra-vert",  # Chinese (Traditional) vertical
    "chr": "chr",  # Cherokee
    "co": "cos",  # Corsican
    "cy": "cym",  # Welsh
    "da": "dan",  # Danish
    "de": "deu",  # German
    "dv": "div",  # Divehi
    "dz": "dzo",  # Dzongkha
    "el": "ell",  # Greek
    "en": "eng",  # English
    "enm": "enm",  # Middle English
    "eo": "epo",  # Esperanto
    "et": "est",  # Estonian
    "eu": "eus",  # Basque
    "fo": "fao",  # Faroese
    "fa": "fas",  # Persian
    "fil": "fil",  # Filipino
    "fi": "fin",  # Finnish
    "fr": "fra",  # French
    "frk": "frk",  # Frankish
    "frm": "frm",  # Middle French
    "fy": "fry",  # Western Frisian
    "gd": "gla",  # Scottish Gaelic
    "ga": "gle",  # Irish
    "gl": "glg",  # Galician
    "grc": "grc",  # Ancient Greek
    "gu": "guj",  # Gujarati
    "ht": "hat",  # Haitian Creole
    "he": "heb",  # Hebrew
    "hi": "hin",  # Hindi
    "hr": "hrv",  # Croatian
    "hu": "hun",  # Hungarian
    "hy": "hye",  # Armenian
    "iu": "iku",  # Inuktitut
    "id": "ind",  # Indonesian
    "is": "isl",  # Icelandic
    "it": "ita",  # Italian
    "ita-old": "ita-old",  # Old Italian
    "jv": "jav",  # Javanese
    "ja": "jpn",  # Japanese
    "jpn-vert": "jpn-vert",  # Japanese vertical
    "kn": "kan",  # Kannada
    "ka": "kat",  # Georgian
    "kat-old": "kat-old",  # Old Georgian
    "kk": "kaz",  # Kazakh
    "km": "khm",  # Khmer
    "ky": "kir",  # Kyrgyz
    "kmr": "kmr",  # Northern Kurdish
    "ko": "kor",  # Korean
    "kor-vert": "kor_vert",  # Korean vertical (underscore in value; see NOTE above)
    "lo": "lao",  # Lao
    "la": "lat",  # Latin
    "lv": "lav",  # Latvian
    "lt": "lit",  # Lithuanian
    "lb": "ltz",  # Luxembourgish
    "ml": "mal",  # Malayalam
    "mr": "mar",  # Marathi
    "mk": "mkd",  # Macedonian
    "mt": "mlt",  # Maltese
    "mn": "mon",  # Mongolian
    "mi": "mri",  # Māori
    "ms": "msa",  # Malay
    "my": "mya",  # Burmese
    "ne": "nep",  # Nepali
    "nl": "nld",  # Dutch
    "no": "nor",  # Norwegian
    "oc": "oci",  # Occitan
    "or": "ori",  # Odia
    # Not a natural language; presumably Tesseract's orientation/script
    # detection data (TODO confirm). supported_languages() filters out any
    # reported name containing "osd".
    "osd": "osd",  # Unknown language [osd]
    "pa": "pan",  # Punjabi
    "pl": "pol",  # Polish
    "pt": "por",  # Portuguese
    "ps": "pus",  # Pashto
    "qu": "que",  # Quechua
    "ro": "ron",  # Romanian
    "ru": "rus",  # Russian
    "sa": "san",  # Sanskrit
    # Script-level entries: key and value are identical.
    "script-arab": "script-arab",  # Arabic script
    "script-armn": "script-armn",  # Armenian script
    "script-beng": "script-beng",  # Bengali script
    "script-cans": "script-cans",  # Canadian Aboriginal script
    "script-cher": "script-cher",  # Cherokee script
    "script-cyrl": "script-cyrl",  # Cyrillic script
    "script-deva": "script-deva",  # Devanagari script
    "script-ethi": "script-ethi",  # Ethiopic script
    "script-frak": "script-frak",  # Frankish script
    "script-geor": "script-geor",  # Georgian script
    "script-grek": "script-grek",  # Greek script
    "script-gujr": "script-gujr",  # Gujarati script
    "script-guru": "script-guru",  # Gurmukhi script
    "script-hang": "script-hang",  # Hangul script
    "script-hang-vert": "script-hang-vert",  # Hangul script vertical
    "script-hans": "script-hans",  # presumably Han (Simplified) script
    "script-hans-vert": "script-hans-vert",  # presumably Han (Simplified) script vertical
    "script-hant": "script-hant",  # presumably Han (Traditional) script
    "script-hant-vert": "script-hant-vert",  # presumably Han (Traditional) script vertical
    "script-hebr": "script-hebr",  # Hebrew script
    "script-jpan": "script-jpan",  # Japanese script
    "script-jpan-vert": "script-jpan-vert",  # Japanese script vertical
    "script-khmr": "script-khmr",  # Khmer script
    "script-knda": "script-knda",  # Kannada script
    "script-laoo": "script-laoo",  # Lao script
    "script-latn": "script-latn",  # presumably Latin script
    "script-mlym": "script-mlym",  # Malayalam script
    "script-mymr": "script-mymr",  # Myanmar script
    "script-orya": "script-orya",  # Odia script
    "script-sinh": "script-sinh",  # Sinhala script
    "script-syrc": "script-syrc",  # Syriac script
    "script-taml": "script-taml",  # Tamil script
    "script-telu": "script-telu",  # Telugu script
    "script-thaa": "script-thaa",  # Thaana script
    "script-thai": "script-thai",  # Thai script
    "script-tibt": "script-tibt",  # Tibetan script
    "script-viet": "script-viet",  # Vietnamese script
    "si": "sin",  # Sinhala
    "sk": "slk",  # Slovak
    "sl": "slv",  # Slovenian
    "sd": "snd",  # Sindhi
    "es": "spa",  # Spanish
    "spa-old": "spa-old",  # Old Spanish
    "sq": "sqi",  # Albanian
    "sr": "srp",  # Serbian
    "srp-latn": "srp-latn",  # Serbian (Latin)
    "su": "sun",  # Sundanese
    "sw": "swa",  # Swahili
    "sv": "swe",  # Swedish
    "syr": "syr",  # Syriac
    "ta": "tam",  # Tamil
    "tt": "tat",  # Tatar
    "te": "tel",  # Telugu
    "tg": "tgk",  # Tajik
    "th": "tha",  # Thai
    "ti": "tir",  # Tigrinya
    "to": "ton",  # Tongan
    "tr": "tur",  # Turkish
    "ug": "uig",  # Uyghur
    "uk": "ukr",  # Ukrainian
    "ur": "urd",  # Urdu
    "uz": "uzb",  # Uzbek
    "uzb-cyrl": "uzb-cyrl",  # Uzbek (Cyrillic)
    "vi": "vie",  # Vietnamese
    "yi": "yid",  # Yiddish
    "yo": "yor",  # Yoruba
}
def supported_languages():
    """Return the codes of the languages the installed Tesseract can use.

    Runs ``tesseract --list-langs``, drops the header line and any entry
    whose name contains ``"osd"``, and translates each remaining Tesseract
    language name back to its key in ``iso_to_tesseract``.

    Returns:
        list[str]: Keys of ``iso_to_tesseract``, in the order Tesseract
        reported the corresponding languages, without duplicates. The list
        is empty when the ``tesseract`` executable cannot be started or
        prints nothing.

    Installed languages that have no entry in ``iso_to_tesseract`` are
    skipped instead of raising ``KeyError``, so one unmapped language pack
    does not hide all the others.
    """
    try:
        # Call tesseract directly (no shell pipeline); run() waits for the
        # child and closes its pipe, so no process or descriptor is leaked.
        completed = subprocess.run(
            ["tesseract", "--list-langs"],
            stdout=subprocess.PIPE,
            check=False,
        )
    except OSError:
        # Executable missing or not runnable: nothing is supported.
        return []

    output_lines = completed.stdout.decode("utf-8", errors="replace").splitlines()

    # The first line is the "List of available languages ..." header.
    tesseract_langs = [
        name
        for name in (line.strip() for line in output_lines[1:])
        if name and "osd" not in name
    ]

    tesseract_to_iso = {
        tesseract_code: iso_code
        for iso_code, tesseract_code in iso_to_tesseract.items()
    }

    # dict.fromkeys() removes duplicates while keeping first-seen order.
    return [
        tesseract_to_iso[name]
        for name in dict.fromkeys(tesseract_langs)
        if name in tesseract_to_iso
    ]