"""
A module for fetching helpfiles, creating vectors for each and bundling
these up in a lookup table.
"""

# System imports
import os
import tarfile

# Third party imports
import numpy as np
from six.moves import urllib

# local imports
from common.keyword_vector import get_vectors, read_dictionary
from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
                             EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
                             VECTORS_NPY, HOME_DIR


def fetch_tbz2_data(tbz2_url=HELP_FILE_URL, data_path=HELP_FILE_PATH,
                    file_name="helpfiles.tbz2"):
    """
    Download an archive from a given url and extract it to the path provided.

    The archive is stored as ``file_name`` inside ``data_path`` (which is
    created when missing) and is then unpacked into that same directory.
    """
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    tbz2_path = os.path.join(data_path, file_name)

    # drop anything left behind by earlier urlretrieve calls before fetching
    urllib.request.urlcleanup()
    urllib.request.urlretrieve(tbz2_url, tbz2_path)

    # NOTE(review): extractall trusts the member paths stored in the archive;
    # confirm that the download source is trusted.
    # The with-statement closes the archive even when extraction fails.
    with tarfile.open(tbz2_path) as tbz2_file:
        tbz2_file.extractall(path=data_path)


def get_list_of_htm_files(path=os.path.join(HELP_FILE_PATH, "html")):
    """
    Return a sorted list of the entries in the given path whose names end
    in "htm".
    """
    return sorted(entry for entry in os.listdir(path)
                  if entry.endswith("htm"))


def extract_keywords(extract_script=None, singular_binary=None):
    """
    Run a Singular script to extract the current keywords, save them in
    KEYWORDS_FILE and return the dictionary read back from that file.

    extract_script defaults to EXTRACT_SCRIPT and singular_binary defaults
    to SINGULAR_BIN when they are not given.
    """
    # ensure the homedir exists
    if not os.path.isdir(HOME_DIR):
        os.makedirs(HOME_DIR)

    if extract_script is None:
        extract_script = EXTRACT_SCRIPT

    if singular_binary is None:
        singular_binary = SINGULAR_BIN

    # extract keywords using the singular script
    # NOTE(review): the command line is handed to a shell without any
    # quoting, so paths containing spaces or shell metacharacters will not
    # work as intended.
    os.system(singular_binary + " -q " + extract_script +
              " | sort | uniq > " + KEYWORDS_FILE)

    # read from the file created by singular; the path is passed explicitly,
    # in the same way create_table does
    return read_dictionary(KEYWORDS_FILE)


def _normalise_rows(vectors):
    """
    Scale every vector (taken along the last axis) to unit Euclidean length.

    Vectors whose norm is zero are returned unchanged instead of being
    turned into NaN by a division by zero.
    """
    norms = np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
    # dividing an all-zero vector by 1 keeps it at zero
    norms = np.where(norms == 0, 1, norms)
    return vectors / norms


def create_table(dictionary=None, attempt_cached=True):
    """
    Get a list of helpfiles, and generate a word occurrence vector for each.

    dictionary is read from KEYWORDS_FILE when it is not given. When
    attempt_cached is true and both VECTORS_NPY and HELPFILE_NPY exist, the
    saved arrays are loaded; otherwise the file list and the vectors are
    rebuilt and saved to those two files.

    Returns a tuple (vectors, file_list) with the vectors normalised.
    """
    if dictionary is None:
        dictionary = read_dictionary(KEYWORDS_FILE)

    cache_present = os.path.isfile(VECTORS_NPY) and \
        os.path.isfile(HELPFILE_NPY)

    if attempt_cached and cache_present:
        vectors = np.load(VECTORS_NPY)
        file_list = np.load(HELPFILE_NPY)
    else:
        if not os.path.exists(HOME_DIR):
            os.makedirs(HOME_DIR)
        file_list = np.array(get_list_of_htm_files())
        np.save(HELPFILE_NPY, file_list)

        filenames = np.array([os.path.join(HELP_FILE_PATH, "html", name)
                              for name in file_list])
        # the unnormalised vectors are what gets cached on disk
        vectors = get_vectors(filenames, dictionary, normalise=False)
        np.save(VECTORS_NPY, vectors)

    # normalise the vectors
    vectors = _normalise_rows(vectors)

    return (vectors, file_list)


def init_table_on_system(extract_script=None, singular_binary=None):
    """
    Check whether the various files exist, and create them if necessary.

    extract_script and singular_binary are passed on to extract_keywords
    when the keywords file has to be generated.
    """
    if not os.path.isdir(HOME_DIR):
        os.makedirs(HOME_DIR)

    # check for and download help files if necessary
    tbz2_path = os.path.join(HELP_FILE_PATH, "helpfiles.tbz2")
    if not (os.path.isdir(HELP_FILE_PATH) and os.path.isfile(tbz2_path)):
        fetch_tbz2_data()

    # Use Singular to extract the keywords and save in a file.
    # With an existing keywords file the dictionary stays None, which makes
    # create_table read it from KEYWORDS_FILE itself.
    dictionary = None
    if not os.path.isfile(KEYWORDS_FILE):
        dictionary = extract_keywords(extract_script, singular_binary)

    if not (os.path.isfile(VECTORS_NPY) and os.path.isfile(HELPFILE_NPY)):
        create_table(dictionary=dictionary,
                     attempt_cached=False)


def is_lookup_initialised():
    """
    Check whether the various files exist, return True if so, False
    otherwise.
    """
    tbz2_path = os.path.join(HELP_FILE_PATH, "helpfiles.tbz2")
    return (os.path.isdir(HELP_FILE_PATH) and
            os.path.isfile(tbz2_path) and
            os.path.isfile(KEYWORDS_FILE) and
            os.path.isdir(HOME_DIR) and
            os.path.isfile(VECTORS_NPY) and
            os.path.isfile(HELPFILE_NPY))