File size: 6,113 Bytes
da8d2e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# readability_indices.py

from nltk.tokenize import sent_tokenize, word_tokenize
import pyphen
import re
from IPython.display import display, HTML

def count_syllables(word, lang):
    """Estimate the number of syllables in ``word``.

    For Kazakh (``lang == 'kk'``) the estimate is the number of vowel letters
    in the lower-cased word. For every other language code the word is
    hyphenated with Pyphen and the estimate is the number of inserted
    hyphens plus one.

    Args:
        word: The word to analyse.
        lang: Language code. ``'kk'`` selects the vowel-counting heuristic;
            any other value is passed to ``pyphen.Pyphen`` as ``lang``.

    Returns:
        The estimated syllable count, never less than 1.
    """
    if lang == 'kk':
        # Kazakh Cyrillic vowel letters. 'ә' has to be part of the set,
        # otherwise words such as "әке" or "мәдениет" are undercounted.
        vowels = frozenset("аәеёиоөуұүыіэюя")
        syllables = sum(1 for char in word.lower() if char in vowels)
        return max(1, syllables)

    # Russian and English (and any other Pyphen language): count the
    # hyphenation points Pyphen marks with '-' and add one for the last part.
    dic = pyphen.Pyphen(lang=lang)
    hyphens = dic.inserted(word)
    return max(1, hyphens.count('-') + 1)

# Complex-word detection
def is_complex_word(word, lang, syllable_threshold=3):
    """Tell whether ``word`` has at least ``syllable_threshold`` syllables.

    Args:
        word: The word to classify.
        lang: Language code forwarded to ``count_syllables``.
        syllable_threshold: Minimum syllable count for a word to be
            considered complex (inclusive).

    Returns:
        ``True`` when the syllable count reaches the threshold.
    """
    return count_syllables(word, lang) >= syllable_threshold

# Readability index calculations
def flesch_reading_ease(text, lang):
    """Compute the Flesch Reading Ease score of ``text``.

    The text is split into sentences and words with NLTK (Russian models for
    ``'ru'``, English models otherwise); only purely alphabetic tokens count
    as words. Sentence and word counts are floored at 1 so the averages are
    always defined.

    Args:
        text: The text to score.
        lang: Language code; ``'ru'``, ``'en'`` and ``'kk'`` each use their
            own coefficients.

    Returns:
        The score as a float, or ``0`` for any other language code.
    """
    tokenizer_language = 'russian' if lang == 'ru' else 'english'
    sentence_list = sent_tokenize(text, language=tokenizer_language)
    tokens = word_tokenize(text, language=tokenizer_language)
    alpha_words = [token for token in tokens if token.isalpha()]

    sentence_total = max(1, len(sentence_list))
    word_total = max(1, len(alpha_words))
    syllable_total = sum(count_syllables(token, lang) for token in alpha_words)

    avg_sentence_length = word_total / sentence_total
    avg_syllables_per_word = syllable_total / word_total

    if lang == 'ru':
        return 206.835 - (1.3 * avg_sentence_length) - (60.1 * avg_syllables_per_word)
    if lang == 'en':
        return 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
    if lang == 'kk':
        # Tentative coefficients for Kazakh.
        return 206.835 - (1.2 * avg_sentence_length) - (70 * avg_syllables_per_word)
    return 0

def flesch_kincaid_grade_level(text, lang):
    """Compute the Flesch-Kincaid Grade Level of ``text``.

    Tokenisation uses NLTK (Russian models for ``'ru'``, English models
    otherwise) and keeps only purely alphabetic tokens as words. Sentence and
    word counts are floored at 1 so the averages are always defined.

    Args:
        text: The text to score.
        lang: Language code; ``'ru'``, ``'en'`` and ``'kk'`` each use their
            own coefficients.

    Returns:
        The grade level as a float, or ``0`` for any other language code.
    """
    tokenizer_language = 'russian' if lang == 'ru' else 'english'
    sentence_list = sent_tokenize(text, language=tokenizer_language)
    tokens = word_tokenize(text, language=tokenizer_language)
    alpha_words = [token for token in tokens if token.isalpha()]

    sentence_total = max(1, len(sentence_list))
    word_total = max(1, len(alpha_words))
    syllable_total = sum(count_syllables(token, lang) for token in alpha_words)

    avg_sentence_length = word_total / sentence_total
    avg_syllables_per_word = syllable_total / word_total

    if lang == 'ru':
        return (0.5 * avg_sentence_length) + (8.4 * avg_syllables_per_word) - 15.59
    if lang == 'en':
        return (0.39 * avg_sentence_length) + (11.8 * avg_syllables_per_word) - 15.59
    if lang == 'kk':
        return (0.5 * avg_sentence_length) + (9 * avg_syllables_per_word) - 13
    return 0

def gunning_fog_index(text, lang):
    """Compute the Gunning Fog index of ``text``.

    The index is ``0.4 * (average sentence length + percentage of complex
    words)``, where a complex word is one accepted by ``is_complex_word``.
    Sentence and word counts are floored at 1 to avoid division by zero.

    Args:
        text: The text to score.
        lang: Language code; ``'ru'`` selects the Russian NLTK models, any
            other value the English ones.

    Returns:
        The Fog index as a float.
    """
    tokenizer_language = 'russian' if lang == 'ru' else 'english'
    sentence_list = sent_tokenize(text, language=tokenizer_language)
    tokens = word_tokenize(text, language=tokenizer_language)
    alpha_words = [token for token in tokens if token.isalpha()]

    sentence_total = max(1, len(sentence_list))
    word_total = max(1, len(alpha_words))

    complex_total = sum(1 for token in alpha_words if is_complex_word(token, lang))
    complex_percentage = (complex_total / word_total) * 100
    avg_sentence_length = word_total / sentence_total
    return 0.4 * (avg_sentence_length + complex_percentage)

def smog_index(text, lang):
    """Compute the SMOG index of ``text``.

    The complex-word count is scaled to a 30-sentence sample before the
    square root is taken. Texts with fewer than three sentences are not
    scored.

    Args:
        text: The text to score.
        lang: Language code; ``'ru'`` selects the Russian NLTK models, any
            other value the English ones.

    Returns:
        The SMOG index as a float, or ``0`` when the text has fewer than
        three sentences.
    """
    tokenizer_language = 'russian' if lang == 'ru' else 'english'
    sentence_total = len(sent_tokenize(text, language=tokenizer_language))
    tokens = word_tokenize(text, language=tokenizer_language)
    alpha_words = [token for token in tokens if token.isalpha()]
    complex_total = sum(1 for token in alpha_words if is_complex_word(token, lang))

    if sentence_total < 3:
        return 0
    return 1.0430 * ((complex_total * (30 / sentence_total)) ** 0.5) + 3.1291

# Highlighting of complex words and sentences
def highlight_complex_text(text, lang):
    """Mark up complex words and sentences in ``text`` as HTML.

    A sentence in which more than 30% of the alphabetic words are complex is
    wrapped whole in ``<mark>``. In every other sentence each complex word is
    wrapped in ``<b>``. The original text is HTML-escaped, so characters such
    as ``<`` or ``&`` in the input are shown literally instead of being
    interpreted as markup.

    Args:
        text: The text to annotate.
        lang: Language code; ``'ru'`` selects the Russian NLTK models, any
            other value the English ones. Also forwarded to
            ``is_complex_word``.

    Returns:
        A tuple ``(highlighted_text, complex_words_list)``: the annotated
        sentences joined by single spaces, and every complex word found, in
        order of appearance and including repeats.
    """
    import html  # local import keeps this block self-contained

    tokenizer_language = 'russian' if lang == 'ru' else 'english'
    highlighted_sentences = []
    complex_words_list = []

    for sentence in sent_tokenize(text, language=tokenizer_language):
        words = word_tokenize(sentence, language=tokenizer_language)
        words_filtered = [word for word in words if word.isalpha()]
        complex_words = [word for word in words_filtered if is_complex_word(word, lang)]
        complex_words_list.extend(complex_words)

        if words_filtered and (len(complex_words) / len(words_filtered)) > 0.3:
            highlighted_sentence = f"<mark>{html.escape(sentence, quote=False)}</mark>"
        elif complex_words:
            # One pass over the sentence with a single alternation pattern.
            # Substituting word by word would wrap a repeated word twice
            # (<b><b>word</b></b>) and could match inside inserted tags.
            unique_words = sorted(set(complex_words), key=len, reverse=True)
            pattern = re.compile(
                r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in unique_words))
            )
            pieces = []
            cursor = 0
            for match in pattern.finditer(sentence):
                pieces.append(html.escape(sentence[cursor:match.start()], quote=False))
                pieces.append(f"<b>{html.escape(match.group(), quote=False)}</b>")
                cursor = match.end()
            pieces.append(html.escape(sentence[cursor:], quote=False))
            highlighted_sentence = ''.join(pieces)
        else:
            highlighted_sentence = html.escape(sentence, quote=False)

        highlighted_sentences.append(highlighted_sentence)

    highlighted_text = ' '.join(highlighted_sentences)
    return highlighted_text, complex_words_list

# Main entry point
def analyze_text(text, lang_code):
    """Print a readability report for ``text`` and display it with highlights.

    Computes the Flesch Reading Ease, Flesch-Kincaid, Gunning Fog and SMOG
    indices, prints them together with the distinct complex words, and
    renders the highlighted text through ``IPython.display``.

    Args:
        text: The text to analyse.
        lang_code: ``'ru'`` (Russian), ``'en'`` (English) or ``'kk'``
            (Kazakh). Any other value prints an error message and returns
            without analysing anything.

    Returns:
        ``None``; all results are printed or displayed.
    """
    if lang_code not in ('ru', 'en', 'kk'):
        print('Unsupported language code. Please use "ru" for Russian, "en" for English, or "kk" for Kazakh.')
        return

    fre = flesch_reading_ease(text, lang_code)
    fkgl = flesch_kincaid_grade_level(text, lang_code)
    fog = gunning_fog_index(text, lang_code)
    smog = smog_index(text, lang_code)

    highlighted_text, complex_words = highlight_complex_text(text, lang_code)

    # Report the results.
    language_names = {'ru': 'Русский', 'en': 'Английский', 'kk': 'Казахский'}
    print(f"Язык: {language_names[lang_code]}")
    print(f"Индекс удобочитаемости Флеша: {fre:.2f}")
    print(f"Индекс Флеша-Кинкейда: {fkgl:.2f}")
    print(f"Индекс тумана Ганнинга: {fog:.2f}")
    print(f"Индекс SMOG: {smog:.2f}")
    print("\nСложные слова:")
    # dict.fromkeys removes repeats while keeping first-appearance order, so
    # the list is identical on every run (a set's order is not).
    print(', '.join(dict.fromkeys(complex_words)))
    print("\nТекст с выделениями:")
    display(HTML(highlighted_text))