1"""
A module for fetching helpfiles, creating vectors for each and bundling
4these up in a lookup table.
5"""
6
7# System imports
8import os
9
10# Third party imports
11import tarfile
12import numpy as np
13from six.moves import urllib
14
15# local imports
16from common.keyword_vector import get_vectors, read_dictionary
17from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
18                        EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
19                        VECTORS_NPY, HOME_DIR
20
21
def fetch_tbz2_data(tbz2_url=HELP_FILE_URL, data_path=HELP_FILE_PATH,
                    file_name="helpfiles.tbz2"):
    """
    Download a tar archive from a url and unpack it next to the download.

    Parameters:
        tbz2_url: address the archive is retrieved from.
        data_path: directory that receives both the downloaded archive and
            its extracted contents; created if it does not exist yet.
        file_name: name under which the archive is stored in data_path.
    """
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    tbz2_path = os.path.join(data_path, file_name)

    # drop temporary files left behind by earlier urlretrieve calls
    urllib.request.urlcleanup()
    urllib.request.urlretrieve(tbz2_url, tbz2_path)

    # The context manager closes the archive even when extraction fails
    # part-way (corrupt download, full disk, ...).
    # NOTE(review): extractall trusts the member paths stored inside the
    # archive, so tbz2_url should only ever point at a trusted source.
    with tarfile.open(tbz2_path) as tbz2_file:
        tbz2_file.extractall(path=data_path)
36
37
def get_list_of_htm_files(path=os.path.join(HELP_FILE_PATH, "html")):
    """
    List the entries of the given directory whose names end in "htm",
    in sorted order.
    """
    return sorted(entry for entry in os.listdir(path)
                  if entry.endswith("htm"))
45
46
def extract_keywords(extract_script=None, singular_binary=None):
    """
    Run a Singular script to extract the current keywords, store them
    sorted and de-duplicated in KEYWORDS_FILE and return them as read back
    by read_dictionary.

    Parameters:
        extract_script: path of the Singular script to run; defaults to
            EXTRACT_SCRIPT.
        singular_binary: path of the Singular executable; defaults to
            SINGULAR_BIN.
    """
    # shlex.quote under Python 3, pipes.quote under Python 2
    from six.moves import shlex_quote

    # ensure the homedir exists
    if not os.path.isdir(HOME_DIR):
        os.makedirs(HOME_DIR)

    if extract_script is None:
        extract_script = EXTRACT_SCRIPT

    if singular_binary is None:
        singular_binary = SINGULAR_BIN

    # Extract keywords using the singular script. Every path is quoted so
    # that spaces or shell metacharacters in it can neither split nor alter
    # the command line.
    os.system(shlex_quote(singular_binary) + " -q " +
              shlex_quote(extract_script) +
              " | sort | uniq > " + shlex_quote(KEYWORDS_FILE))

    # read back exactly the file that was just written
    return read_dictionary(KEYWORDS_FILE)
70
71
def create_table(dictionary=None, attempt_cached=True):
    """
    Get a list of helpfiles, and generate a word occurrence vector for each.

    The raw vectors and the list of file names are stored in VECTORS_NPY and
    HELPFILE_NPY. They are loaded from there instead of being recomputed
    when both files exist and attempt_cached is true.

    Parameters:
        dictionary: keywords handed to get_vectors; read from KEYWORDS_FILE
            when None.
        attempt_cached: pass False to force regeneration of both files.

    Returns:
        Tuple (vectors, file_list): the vectors scaled to unit length along
        their last axis, and the array of help file names.
    """
    if dictionary is None:
        dictionary = read_dictionary(KEYWORDS_FILE)

    cache_usable = attempt_cached and os.path.isfile(VECTORS_NPY) \
        and os.path.isfile(HELPFILE_NPY)

    if cache_usable:
        vectors = np.load(VECTORS_NPY)
        file_list = np.load(HELPFILE_NPY)
    else:
        if not os.path.exists(HOME_DIR):
            os.makedirs(HOME_DIR)
        file_list = np.array(get_list_of_htm_files())
        np.save(HELPFILE_NPY, file_list)

        filenames = np.array([os.path.join(HELP_FILE_PATH, "html", name)
                              for name in file_list])
        # the unnormalised vectors are what gets cached
        vectors = get_vectors(filenames, dictionary, normalise=False)
        np.save(VECTORS_NPY, vectors)

    # normalise the vectors
    norms = np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
    # An all-zero vector has norm 0; dividing it by 1 keeps it at zero
    # instead of turning the whole row into NaN.
    vectors = vectors / np.where(norms == 0, 1, norms)

    return (vectors, file_list)
104
def init_table_on_system(extract_script=None, singular_binary=None):
    """
    Make sure all files needed for the lookup are present, creating the
    missing ones.
    """
    if not os.path.isdir(HOME_DIR):
        os.makedirs(HOME_DIR)

    # download the help files unless the archive is already in place
    archive = os.path.join(HELP_FILE_PATH, "helpfiles.tbz2")
    if not (os.path.isdir(HELP_FILE_PATH) and os.path.isfile(archive)):
        fetch_tbz2_data()

    # let Singular write the keywords file if there is none yet
    keywords = None
    if not os.path.isfile(KEYWORDS_FILE):
        keywords = extract_keywords(extract_script, singular_binary)

    # build the vector table unless both of its files exist
    if not (os.path.isfile(VECTORS_NPY) and os.path.isfile(HELPFILE_NPY)):
        create_table(dictionary=keywords, attempt_cached=False)
126
def is_lookup_initialised():
    """
    Report whether every file the lookup relies on is present.

    Returns True if all of them exist, False if any one is missing.
    """
    archive = os.path.join(HELP_FILE_PATH, "helpfiles.tbz2")
    return (os.path.isdir(HELP_FILE_PATH)
            and os.path.isfile(archive)
            and os.path.isfile(KEYWORDS_FILE)
            and os.path.isdir(HOME_DIR)
            and os.path.isfile(VECTORS_NPY)
            and os.path.isfile(HELPFILE_NPY))
144