1#!/bin/sh -e
2
3fail() {
4    echo "Error: $1"
5    exit 1
6}
7
8notExists() {
9    [ ! -f "$1" ]
10}
11
12log() {
13    if [ "${VERBOSITY}" = "-v 3" ]; then
14        echo "$@"
15    fi
16}
17
18abspath() {
19    if [ -d "$1" ]; then
20        (cd "$1"; pwd)
21    elif [ -f "$1" ]; then
22        if [ -z "${1##*/*}" ]; then
23            echo "$(cd "${1%/*}"; pwd)/${1##*/}"
24        else
25            echo "$(pwd)/$1"
26        fi
27    elif [ -d "$(dirname "$1")" ]; then
28        echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
29    fi
30}
31
32# check number of input variables
33[ "$#" -ne 6 ] && echo "Please provide <i:oldSequenceDB> <i:newSequenceDB> <i:oldClusteringDB> <o:newMappedSequenceDB> <o:newClusteringDB> <o:tmpDir>" && exit 1
34# check if files exist
35[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1
36[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1
37[ ! -f "$3.dbtype" ] && echo "$3.dbtype not found!" && exit 1
38[   -f "$5.dbtype" ] && echo "$5.dbtype exists already!" && exit 1
39[ ! -d "$6" ] && echo "tmp directory $6 not found!" && exit 1
40
41OLDDB="$(abspath "$1")"
42NEWDB="$(abspath "$2")"
43OLDCLUST="$(abspath "$3")"
44NEWMAPDB="$(abspath "$4")"
45NEWCLUST="$(abspath "$5")"
46TMP_PATH="$(abspath "$6")"
47
48if notExists "${TMP_PATH}/removedSeqs"; then
49    # shellcheck disable=SC2086
50    "$MMSEQS" diffseqdbs "$OLDDB" "$NEWDB" "${TMP_PATH}/removedSeqs" "${TMP_PATH}/mappingSeqs" "${TMP_PATH}/newSeqs" ${DIFF_PAR} \
51        || fail "Diff died"
52fi
53
54if [ ! -s "${TMP_PATH}/mappingSeqs" ]; then
55    cat <<WARN
56WARNING: There are no common sequences between $OLDDB and $NEWDB.
57If you aim to add the sequences of $NEWDB to your previous clustering $OLDCLUST, you can run:
58
59mmseqs concatdbs \"$OLDDB\" \"$NEWDB\" \"${OLDDB}.withNewSequences\"
60mmseqs concatdbs \"${OLDDB}_h\" \"${NEWDB}_h\" \"${OLDDB}.withNewSequences_h\"
61mmseqs clusterupdate \"$OLDDB\" \"${OLDDB}.withNewSequences\" \"$OLDCLUST\" \"$NEWCLUST\" \"${TMP_PATH}\"
62WARN
63    rm -f "${TMP_PATH}/removedSeqs" "${TMP_PATH}/mappingSeqs" "${TMP_PATH}/newSeqs"
64    exit 1
65fi
66
67if [ -s "${TMP_PATH}/removedSeqs" ]; then
68    if [ -n "${RECOVER_DELETED}" ]; then
69        log "=== Recover removed sequences"
70        if notExists "${TMP_PATH}/OLDDB.removedMapping"; then
71            HIGHESTID="$(awk '$1 > max { max = $1 } END { print max }' "${NEWDB}.index")"
72            awk -v highest="$HIGHESTID" 'BEGIN { start=highest+1 } { print $1"\t"start; start=start+1; }' \
73                "${TMP_PATH}/removedSeqs" > "${TMP_PATH}/OLDDB.removedMapping"
74            cat "${TMP_PATH}/OLDDB.removedMapping" >> "${TMP_PATH}/mappingSeqs"
75        fi
76
77        if notExists "${TMP_PATH}/NEWDB.withOld.dbtype"; then
78            # shellcheck disable=SC2086
79            "$MMSEQS" renamedbkeys "${TMP_PATH}/OLDDB.removedMapping" "${OLDDB}" "${TMP_PATH}/OLDDB.removedDb" --subdb-mode 1 ${VERBOSITY} \
80                || fail "renamedbkeys died"
81            # shellcheck disable=SC2086
82            "$MMSEQS" concatdbs "$NEWDB" "${TMP_PATH}/OLDDB.removedDb" "${TMP_PATH}/NEWDB.withOld" --preserve-keys --threads 1 ${VERBOSITY} \
83                || fail "concatdbs died"
84            # shellcheck disable=SC2086
85            "$MMSEQS" concatdbs "${NEWDB}_h" "${TMP_PATH}/OLDDB.removedDb_h" "${TMP_PATH}/NEWDB.withOld_h" --preserve-keys --threads 1 ${VERBOSITY} \
86                || fail "concatdbs died"
87        fi
88        NEWDB="${TMP_PATH}/NEWDB.withOld"
89
90        if [ -n "$REMOVE_TMP" ]; then
91            echo "Remove temporary files 1/3"
92            rm -f "${TMP_PATH}/OLDDB.removedMapping"
93            # shellcheck disable=SC2086
94            "$MMSEQS" rmdb "${TMP_PATH}/OLDDB.removedDb" ${VERBOSITY}
95        fi
96    else
97        if notExists "${TMP_PATH}/OLCLUST.withoutDeletedKeys.dbtype"; then
98            # shellcheck disable=SC2086
99            "$MMSEQS" createsubdb "${TMP_PATH}/mappingSeqs" "${OLDCLUST}" "${TMP_PATH}/OLCLUST.withoutDeletedKeys" --subdb-mode 1 ${VERBOSITY} \
100                || fail "createsubdb died"
101        fi
102        if notExists "${TMP_PATH}/OLCLUST.withoutDeleted.dbtype"; then
103            # shellcheck disable=SC2086
104            "$MMSEQS" filterdb "${TMP_PATH}/OLCLUST.withoutDeletedKeys" "${TMP_PATH}/OLCLUST.withoutDeleted" --filter-file "${TMP_PATH}/removedSeqs" --positive-filter ${THREADS_PAR} \
105                || fail "filterdb died"
106        fi
107        OLDCLUST="${TMP_PATH}/OLCLUST.withoutDeleted"
108    fi
109fi
110
111if notExists "${TMP_PATH}/newMappingSeqs"; then
112    log "=== Update new sequences with old keys"
113    MAXID="$(awk '$1 > max { max = $1 } END { print max }' "${OLDDB}.index" "${NEWDB}.index")"
114    awk -v highest="$MAXID" 'BEGIN { start=highest+1 } { print $1"\t"start; start=start+1; }' \
115        "${TMP_PATH}/newSeqs" > "${TMP_PATH}/newSeqs.mapped"
116    awk '{ print $2"\t"$1 }' "${TMP_PATH}/mappingSeqs" > "${TMP_PATH}/mappingSeqs.reverse"
117    cat "${TMP_PATH}/mappingSeqs.reverse" "${TMP_PATH}/newSeqs.mapped" > "${TMP_PATH}/newMappingSeqs"
118    awk '{ print $2 }' "${TMP_PATH}/newSeqs.mapped" > "${TMP_PATH}/newSeqs"
119fi
120
121if notExists "${NEWMAPDB}.dbtype"; then
122    # shellcheck disable=SC2086
123    "$MMSEQS" renamedbkeys "${TMP_PATH}/newMappingSeqs" "${NEWDB}" "${NEWMAPDB}" ${VERBOSITY} \
124        || fail "renamedbkeys died"
125fi
126NEWDB="${NEWMAPDB}"
127
128if notExists "${TMP_PATH}/NEWDB.newSeqs.dbtype"; then
129    log "=== Filter out new from old sequences"
130    # shellcheck disable=SC2086
131    "$MMSEQS" createsubdb "${TMP_PATH}/newSeqs" "$NEWDB" "${TMP_PATH}/NEWDB.newSeqs" ${VERBOSITY} --subdb-mode 1 \
132        || fail "createsubdb died"
133fi
134
135if notExists "${TMP_PATH}/OLDDB.repSeq.dbtype"; then
136    log "=== Extract representative sequences"
137    # shellcheck disable=SC2086
138    "$MMSEQS" result2repseq "$OLDDB" "$OLDCLUST" "${TMP_PATH}/OLDDB.repSeq" ${RESULT2REPSEQ_PAR} \
139        || fail "result2repseq died"
140fi
141
142if notExists "${TMP_PATH}/newSeqsHits.dbtype"; then
143    log "=== Search new sequences against representatives"
144    # shellcheck disable=SC2086
145    "$MMSEQS" search "${TMP_PATH}/NEWDB.newSeqs" "${TMP_PATH}/OLDDB.repSeq" "${TMP_PATH}/newSeqsHits" "${TMP_PATH}/search" ${SEARCH_PAR} \
146        || fail "search died"
147fi
148
149if notExists "${TMP_PATH}/newSeqsHits.swapped.all.dbtype"; then
150    # shellcheck disable=SC2086
151    "$MMSEQS" swapdb "${TMP_PATH}/newSeqsHits" "${TMP_PATH}/newSeqsHits.swapped.all" ${THREADS_PAR} \
152        || fail "swapdb died"
153    awk '$3 > 1 { print 1; exit; }' "${TMP_PATH}/newSeqsHits.swapped.all.index" > "${TMP_PATH}/newSeqsHits.swapped.hasHits"
154fi
155
156if [ -s "${TMP_PATH}/newSeqsHits.swapped.hasHits" ] && notExists "${TMP_PATH}/newSeqsHits.swapped.dbtype"; then
157    # shellcheck disable=SC2086
158    "$MMSEQS" filterdb "${TMP_PATH}/newSeqsHits.swapped.all" "${TMP_PATH}/newSeqsHits.swapped" --trim-to-one-column ${THREADS_PAR} \
159        || fail "filterdb died"
160fi
161
162UPDATEDCLUST="${TMP_PATH}/updatedClust"
163if [ -f "${TMP_PATH}/newSeqsHits.swapped.dbtype" ]; then
164    if notExists "${TMP_PATH}/updatedClust.dbtype"; then
165        log "=== Merge found sequences with previous clustering"
166        # shellcheck disable=SC2086
167        "$MMSEQS" mergedbs "$OLDCLUST" "${TMP_PATH}/updatedClust" "$OLDCLUST" "${TMP_PATH}/newSeqsHits.swapped" ${VERBOSITY} \
168            || fail "mergedbs died"
169    fi
170else
171    UPDATEDCLUST="$OLDCLUST"
172fi
173
174if notExists "${TMP_PATH}/toBeClusteredSeparately.dbtype"; then
175    log "=== Extract unmapped sequences"
176    awk '$3 == 1 {print $1}' "${TMP_PATH}/newSeqsHits.index" > "${TMP_PATH}/noHitSeqList"
177    # shellcheck disable=SC2086
178    "$MMSEQS" createsubdb "${TMP_PATH}/noHitSeqList" "$NEWDB" "${TMP_PATH}/toBeClusteredSeparately" ${VERBOSITY} --subdb-mode 1 \
179        || fail "createsubdb of not hit seq. died"
180fi
181
182if notExists "${TMP_PATH}/newClusters.dbtype" && [ -s "${TMP_PATH}/toBeClusteredSeparately.index" ]; then
183    log "=== Cluster separately the singleton sequences"
184    # shellcheck disable=SC2086
185    "$MMSEQS" cluster "${TMP_PATH}/toBeClusteredSeparately" "${TMP_PATH}/newClusters" "${TMP_PATH}/cluster" ${CLUST_PAR} \
186        || fail "cluster of new seq. died"
187fi
188
189if [ -f "${TMP_PATH}/newClusters.dbtype" ]; then
190    if notExists "$NEWCLUST"; then
191        log "=== Merge updated clustering with new clusters"
192        # shellcheck disable=SC2086
193        "$MMSEQS" concatdbs "${UPDATEDCLUST}" "${TMP_PATH}/newClusters" "$NEWCLUST" --preserve-keys ${THREADS_PAR} \
194            || fail "concatdbs died"
195    fi
196else
197    # shellcheck disable=SC2086
198    "$MMSEQS" mvdb "${UPDATEDCLUST}" "$NEWCLUST" ${VERBOSITY}
199fi
200
201if [ -n "$REMOVE_TMP" ]; then
202    rm -f "${TMP_PATH}/newSeqs.mapped" "${TMP_PATH}/mappingSeqs.reverse" "${TMP_PATH}/newMappingSeqs"
203    rm -f "${TMP_PATH}/noHitSeqList" "${TMP_PATH}/mappingSeqs" "${TMP_PATH}/newSeqs" "${TMP_PATH}/removedSeqs"
204    rm -f "${TMP_PATH}/newSeqsHits.swapped.hasHits"
205
206    if [ -n "${RECOVER_DELETED}" ]; then
207        # shellcheck disable=SC2086
208        "$MMSEQS" rmdb "${TMP_PATH}/NEWDB.withOld" ${VERBOSITY}
209        # shellcheck disable=SC2086
210        "$MMSEQS" rmdb "${TMP_PATH}/NEWDB.withOld_h" ${VERBOSITY}
211    else
212        # shellcheck disable=SC2086
213        "$MMSEQS" rmdb "${TMP_PATH}/OLCLUST.withoutDeletedKeys" ${VERBOSITY}
214        # shellcheck disable=SC2086
215        "$MMSEQS" rmdb "${TMP_PATH}/OLCLUST.withoutDeleted" ${VERBOSITY}
216    fi
217
218    # shellcheck disable=SC2086
219    "$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits.swapped" ${VERBOSITY}
220    # shellcheck disable=SC2086
221    "$MMSEQS" rmdb "${TMP_PATH}/newClusters" ${VERBOSITY}
222    # shellcheck disable=SC2086
223    "$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits" ${VERBOSITY}
224    # shellcheck disable=SC2086
225    "$MMSEQS" rmdb "${TMP_PATH}/toBeClusteredSeparately" ${VERBOSITY}
226    # shellcheck disable=SC2086
227    "$MMSEQS" rmdb "${TMP_PATH}/NEWDB.newSeqs" ${VERBOSITY}
228    # shellcheck disable=SC2086
229    "$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits.swapped.all" ${VERBOSITY}
230    # shellcheck disable=SC2086
231    "$MMSEQS" rmdb "${TMP_PATH}/OLDDB.repSeq" ${VERBOSITY}
232    # shellcheck disable=SC2086
233    "$MMSEQS" rmdb "${TMP_PATH}/updatedClust" ${VERBOSITY}
234
235    rm -rf "${TMP_PATH}/search" "${TMP_PATH}/cluster"
236    rm -f "${TMP_PATH}/update_clustering.sh"
237fi
238