1# 한국어기초사전 불규칙용언 찾기
2
3import sys
4import datetime
5import tzlocal
6import unicodedata
7import yaml
8
# Conjoining Hangul jamo, as they appear in NFD-decomposed syllables.
# Prefix: L_ = leading consonant (choseong), V_ = vowel (jungseong),
# T_ = trailing/final consonant (jongseong).
L_RIEUL = '\N{HANGUL CHOSEONG RIEUL}'      # U+1105
V_A = '\N{HANGUL JUNGSEONG A}'             # U+1161
V_AE = '\N{HANGUL JUNGSEONG AE}'           # U+1162
V_EO = '\N{HANGUL JUNGSEONG EO}'           # U+1165
V_E = '\N{HANGUL JUNGSEONG E}'             # U+1166
V_EU = '\N{HANGUL JUNGSEONG EU}'           # U+1173
T_KIYEOK = '\N{HANGUL JONGSEONG KIYEOK}'   # U+11A8
T_NIEUN = '\N{HANGUL JONGSEONG NIEUN}'     # U+11AB
T_TIKEUT = '\N{HANGUL JONGSEONG TIKEUT}'   # U+11AE
T_RIEUL = '\N{HANGUL JONGSEONG RIEUL}'     # U+11AF
T_PIEUP = '\N{HANGUL JONGSEONG PIEUP}'     # U+11B8
T_SIOS = '\N{HANGUL JONGSEONG SIOS}'       # U+11BA
T_HIEUH = '\N{HANGUL JONGSEONG HIEUH}'     # U+11C2
22
def is_jongseong(j):
    """Return True if *j* is a final-consonant (jongseong) jamo.

    *j* must be a single character; its code point is tested against the
    inclusive range from T_KIYEOK to T_HIEUH.
    """
    code_point = ord(j)
    lowest = ord(T_KIYEOK)
    highest = ord(T_HIEUH)
    return lowest <= code_point <= highest
25
def _nfc(text):
    """Recompose decomposed jamo into precomposed Hangul syllables."""
    return unicodedata.normalize('NFC', text)


def _any_listed(candidates, inflections):
    """Return True if at least one candidate form occurs in *inflections*."""
    return any(form in inflections for form in candidates)


def detect_irregular(word, inflections):
    """Classify the conjugation type of a Korean verb/adjective headword.

    For every stem shape that can conjugate irregularly, the possible
    irregular and regular surface forms are built and looked up in
    *inflections*.  Endings such as '어/아' should strictly be chosen by
    vowel harmony, but finding either variant in the inflection data is
    enough to decide, so both are simply tried.

    Args:
        word: Dictionary form of the word; expected to end in '다'.
        inflections: Collection of inflected surface forms (strings).

    Returns:
        One of '르불규칙', '러불규칙', 'ㄷ불규칙', 'ㅂ불규칙', 'ㅅ불규칙',
        'ㅎ불규칙', '규칙', or a label ending in '미확정' when the stem shape
        allows an irregular conjugation but the inflections do not settle
        it.  None when the word does not end in '다' (a notice is printed),
        when the stem is empty, or when the stem shape is not one of the
        shapes checked here.
    """
    if not word.endswith('다'):
        print('어라? %s' % word)
        return None

    stem = word[:-1]
    nfd = unicodedata.normalize('NFD', stem)
    if not nfd:
        # A bare '다' has no stem to inspect; indexing below would fail.
        return None

    regular_eo_a = (stem + '어', stem + '아')

    if nfd.endswith(L_RIEUL + V_EU):
        before_reu = nfd[:-2]
        doubled = _nfc(before_reu + T_RIEUL)
        if _any_listed((doubled + '러', doubled + '라'), inflections):
            return '르불규칙'
        if _any_listed((stem + '러', stem + '라'), inflections):
            return '러불규칙'
        contracted = (_nfc(nfd[:-1] + V_EO), _nfc(nfd[:-1] + V_A))
        if _any_listed(contracted, inflections):
            return '규칙'
        # If the syllable before '르' ends in a final consonant (as in
        # '-ㄹ르다'), it cannot be 르-irregular and a form like '-ㄹ르러'
        # looks unlikely as well, so it is assumed to be the regular '-ㄹ러'.
        # The emptiness guard covers a stem that is exactly '르'.
        if before_reu and is_jongseong(before_reu[-1]):
            return '규칙'
        return '르/러불규칙 미확정'

    final = nfd[-1]
    without_final = nfd[:-1]

    if final == T_TIKEUT:
        swapped = _nfc(without_final + T_RIEUL)
        if _any_listed((swapped + '어', swapped + '아'), inflections):
            return 'ㄷ불규칙'
        if _any_listed(regular_eo_a, inflections):
            return '규칙'
        return 'ㄷ불규칙 미확정'

    if final == T_PIEUP:
        dropped = _nfc(without_final)
        if _any_listed((dropped + '워', dropped + '와', dropped + '운'),
                       inflections):
            return 'ㅂ불규칙'
        if _any_listed(regular_eo_a, inflections):
            return '규칙'
        return 'ㅂ불규칙 미확정'

    if final == T_SIOS:
        dropped = _nfc(without_final)
        if _any_listed((dropped + '어', dropped + '아'), inflections):
            return 'ㅅ불규칙'
        if _any_listed(regular_eo_a, inflections):
            return '규칙'
        return 'ㅅ불규칙 미확정'

    if final == T_HIEUH:
        irregular_forms = (
            _nfc(without_final + T_NIEUN),
            _nfc(nfd[:-2] + V_AE),
            _nfc(nfd[:-2] + V_E),
        )
        if _any_listed(irregular_forms, inflections):
            return 'ㅎ불규칙'
        if _any_listed((stem + '은', stem + '아', stem + '어'), inflections):
            return '규칙'
        # '-렇다' and '-랗다' are irregular.  The syllable to test is the
        # last one of the stem; the last one of the word is always '다'.
        if stem[-1] in ('렇', '랗'):
            return 'ㅎ불규칙'
        return 'ㅎ불규칙 미확정'

    return None
117
def process_file(filename):
    """Detect the conjugation type for one YAML entry file and save it.

    The file is loaded, and entries that have no '한국어기초사전' import or
    whose part of speech is not a verb/adjective (including auxiliaries)
    are left untouched.  Otherwise the result of detect_irregular() is
    stored under processed -> '맞춤법 검사' -> '불규칙 활용' and the file is
    rewritten in place.  Progress is printed to stdout; undecided results
    (labels ending in '미확정') are printed together with the known
    inflections.

    Args:
        filename: Path of the YAML entry file to read and update.
    """
    # NOTE(review): the original called yaml.load() without a Loader, which
    # is deprecated since PyYAML 5.1 and a TypeError on PyYAML 6.  safe_load
    # only builds plain YAML types; if entry files ever carry Python-specific
    # tags, switch to an explicit Loader instead.
    # UTF-8 is requested explicitly because the data is Korean text and the
    # dump below emits non-ASCII characters unescaped.
    with open(filename, encoding='utf-8') as fp:
        k = yaml.safe_load(fp)

    if '한국어기초사전' not in k['imported']:
        return
    imported = k['imported']['한국어기초사전']
    if imported['품사'] not in ('형용사', '동사', '보조 형용사', '보조 동사'):
        return

    spelling = k['processed']['맞춤법 검사']

    # Check for an already-set value (currently disabled, so the value is
    # recomputed and overwritten on every run):
    # if '불규칙 활용' in spelling:
    #     return

    word = spelling['표제어']
    inflections = [dd['형태'] for dd in imported.get('활용') or []]

    result = detect_irregular(word, inflections)
    if not result:
        return

    if result.endswith('미확정'):
        print('단어: %s (%s), 활용: %s' % (word, result, ', '.join(inflections)))

    print(filename)
    spelling['불규칙 활용'] = result
    print('불규칙:' + result)

    # Serialize before opening for writing, so that a dump failure cannot
    # leave the entry file truncated.
    text = yaml.dump(k, allow_unicode=True, default_flow_style=False, indent=2)
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(text)
148
149
if __name__ == '__main__':
    # Each command-line argument is handed to process_file() in order.
    for path in sys.argv[1:]:
        process_file(path)
154