"""Find irregularly conjugated predicates in 한국어기초사전 entries.

Every file named on the command line is read as a YAML document.  When the
document carries data imported from 한국어기초사전 (Basic Korean Dictionary)
and its part of speech is a verb or an adjective, the conjugation class is
inferred from the listed inflected forms and written back into the file
under ``processed -> 맞춤법 검사 -> 불규칙 활용``.
"""

import sys
import datetime  # noqa: F401  (not referenced in this module)
import unicodedata

try:
    # Not referenced in this module.  The import is kept, but made optional,
    # so that a missing package does not stop the script from running.
    import tzlocal  # noqa: F401
except ImportError:
    pass

# Conjoining Hangul jamo as they appear in NFD-decomposed text:
# L_* = leading consonant, V_* = vowel, T_* = trailing (final) consonant.
L_RIEUL = '\u1105'
V_A = '\u1161'
V_AE = '\u1162'
V_EO = '\u1165'
V_E = '\u1166'
V_EU = '\u1173'
T_KIYEOK = '\u11A8'
T_NIEUN = '\u11AB'
T_TIKEUT = '\u11AE'
T_RIEUL = '\u11AF'
T_PIEUP = '\u11B8'
T_SIOS = '\u11BA'
T_HIEUH = '\u11C2'


def is_jongseong(j):
    """Return True if the character *j* lies in T_KIYEOK..T_HIEUH inclusive."""
    return ord(T_KIYEOK) <= ord(j) <= ord(T_HIEUH)


def _nfc(jamo):
    """Recompose a decomposed jamo string into precomposed syllables."""
    return unicodedata.normalize('NFC', jamo)


def _first_match(candidates, inflections, default=None):
    """Return the label of the first candidate form found in *inflections*.

    *candidates* is an ordered sequence of ``(form, label)`` pairs; the order
    decides which label wins when several forms are listed.  *default* is
    returned when none of the forms is present.
    """
    for form, label in candidates:
        if form in inflections:
            return label
    return default


def detect_irregular(word, inflections):
    """Classify the conjugation of the predicate *word*.

    For every stem shape that may conjugate irregularly, the possible
    irregular and regular forms are built and looked up in *inflections*.
    Endings such as 어/아 should strictly be chosen by vowel harmony, but a
    single hit in the inflection data is enough to decide, so both variants
    are simply tried.

    Args:
        word: dictionary form; it must end in '다'.
        inflections: container of inflected forms (strings) of the word.

    Returns:
        A label such as 'ㅂ불규칙' (irregular), '규칙' (regular) or
        '... 미확정' (could not be decided from the data).  None is returned
        when the word does not end in '다', when nothing precedes '다', or
        when the stem has none of the shapes handled here.
    """
    if not word.endswith('다'):
        print('어라? %s' % word)
        return None

    stem = word[:-1]
    nfd = unicodedata.normalize('NFD', stem)
    if not nfd:
        # Nothing precedes '다': there is no stem to classify (indexing the
        # empty string below would raise IndexError).
        return None

    if nfd[-2:] == L_RIEUL + V_EU:
        # Stem ends in the syllable 르.
        contracted = _nfc(nfd[:-2] + T_RIEUL)
        result = _first_match(
            [
                (contracted + '러', '르불규칙'),
                (contracted + '라', '르불규칙'),
                (stem + '러', '러불규칙'),
                (stem + '라', '러불규칙'),
                (_nfc(nfd[:-1] + V_EO), '규칙'),
                (_nfc(nfd[:-1] + V_A), '규칙'),
            ],
            inflections,
        )
        if result is None:
            # The length guard keeps a bare one-syllable 르 stem from
            # raising IndexError on nfd[-3].
            if len(nfd) >= 3 and is_jongseong(nfd[-3]):
                # With a final consonant before 르, as in '-ㄹ르다', the
                # word cannot be 르-irregular, and '-ㄹ르러' looks unlikely
                # as well, so it is taken to be the regular '-ㄹ러'.
                result = '규칙'
            else:
                result = '르/러불규칙 미확정'
        return result

    final = nfd[-1]
    regular_eo_a = [(stem + '어', '규칙'), (stem + '아', '규칙')]

    if final == T_TIKEUT:
        with_rieul = _nfc(nfd[:-1] + T_RIEUL)
        return _first_match(
            [(with_rieul + '어', 'ㄷ불규칙'), (with_rieul + '아', 'ㄷ불규칙')]
            + regular_eo_a,
            inflections,
            'ㄷ불규칙 미확정',
        )

    if final == T_PIEUP:
        dropped = _nfc(nfd[:-1])
        return _first_match(
            [
                (dropped + '워', 'ㅂ불규칙'),
                (dropped + '와', 'ㅂ불규칙'),
                (dropped + '운', 'ㅂ불규칙'),
            ]
            + regular_eo_a,
            inflections,
            'ㅂ불규칙 미확정',
        )

    if final == T_SIOS:
        dropped = _nfc(nfd[:-1])
        return _first_match(
            [(dropped + '어', 'ㅅ불규칙'), (dropped + '아', 'ㅅ불규칙')]
            + regular_eo_a,
            inflections,
            'ㅅ불규칙 미확정',
        )

    if final == T_HIEUH:
        result = _first_match(
            [
                (_nfc(nfd[:-1] + T_NIEUN), 'ㅎ불규칙'),
                (_nfc(nfd[:-2] + V_AE), 'ㅎ불규칙'),
                (_nfc(nfd[:-2] + V_E), 'ㅎ불규칙'),
                (stem + '은', '규칙'),
                (stem + '아', '규칙'),
                (stem + '어', '규칙'),
            ],
            inflections,
        )
        if result is None:
            # '-렇다' and '-랗다' are irregular.  The stem is tested here:
            # the last character of the whole word is always '다'.
            if stem.endswith(('렇', '랗')):
                result = 'ㅎ불규칙'
            else:
                result = 'ㅎ불규칙 미확정'
        return result

    return None


def process_file(filename):
    """Detect the conjugation class of the entry in *filename* and save it.

    The file is left untouched when it has no 한국어기초사전 data, when its
    part of speech is not a verb/adjective (plain or auxiliary), or when
    detect_irregular() returns nothing.  Otherwise the result is stored under
    ``processed -> 맞춤법 검사 -> 불규칙 활용`` and the file is rewritten.
    """
    # PyYAML is needed only here; importing it lazily keeps the detection
    # logic importable without third-party packages.
    import yaml

    # NOTE(review): this used to be yaml.load() without a Loader, which
    # raises TypeError on PyYAML >= 6 and can construct arbitrary objects on
    # older versions.  safe_load() accepts standard YAML tags only - confirm
    # that the data files contain nothing else.
    with open(filename, encoding='utf-8') as fp:
        k = yaml.safe_load(fp)

    if '한국어기초사전' not in k['imported']:
        return
    imported = k['imported']['한국어기초사전']
    if imported['품사'] not in ['형용사', '동사', '보조 형용사', '보조 동사']:
        return

    # Check whether it has already been set (currently disabled)
    # if '불규칙 활용' in k['processed']['맞춤법 검사']:
    #     return

    word = k['processed']['맞춤법 검사']['표제어']
    inflections = [dd['형태'] for dd in imported.get('활용', [])]

    result = detect_irregular(word, inflections)
    if not result:
        return

    if result.endswith('미확정'):
        print('단어: %s (%s), 활용: %s' % (word, result, ', '.join(inflections)))

    print(filename)
    k['processed']['맞춤법 검사']['불규칙 활용'] = result
    print('불규칙:' + result)

    # Serialize before opening the file for writing, so that a failure in
    # yaml.dump() cannot leave a truncated file behind.
    text = yaml.dump(k, allow_unicode=True, default_flow_style=False, indent=2)
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(text)


if __name__ == '__main__':
    filenames = sys.argv[1:]
    for filename in filenames:
        process_file(filename)