1#!/bin/sh -e 2 3fail() { 4 echo "Error: $1" 5 exit 1 6} 7 8notExists() { 9 [ ! -f "$1" ] 10} 11 12log() { 13 if [ "${VERBOSITY}" = "-v 3" ]; then 14 echo "$@" 15 fi 16} 17 18abspath() { 19 if [ -d "$1" ]; then 20 (cd "$1"; pwd) 21 elif [ -f "$1" ]; then 22 if [ -z "${1##*/*}" ]; then 23 echo "$(cd "${1%/*}"; pwd)/${1##*/}" 24 else 25 echo "$(pwd)/$1" 26 fi 27 elif [ -d "$(dirname "$1")" ]; then 28 echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" 29 fi 30} 31 32# check number of input variables 33[ "$#" -ne 6 ] && echo "Please provide <i:oldSequenceDB> <i:newSequenceDB> <i:oldClusteringDB> <o:newMappedSequenceDB> <o:newClusteringDB> <o:tmpDir>" && exit 1 34# check if files exist 35[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1 36[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1 37[ ! -f "$3.dbtype" ] && echo "$3.dbtype not found!" && exit 1 38[ -f "$5.dbtype" ] && echo "$5.dbtype exists already!" && exit 1 39[ ! -d "$6" ] && echo "tmp directory $6 not found!" && exit 1 40 41OLDDB="$(abspath "$1")" 42NEWDB="$(abspath "$2")" 43OLDCLUST="$(abspath "$3")" 44NEWMAPDB="$(abspath "$4")" 45NEWCLUST="$(abspath "$5")" 46TMP_PATH="$(abspath "$6")" 47 48if notExists "${TMP_PATH}/removedSeqs"; then 49 # shellcheck disable=SC2086 50 "$MMSEQS" diffseqdbs "$OLDDB" "$NEWDB" "${TMP_PATH}/removedSeqs" "${TMP_PATH}/mappingSeqs" "${TMP_PATH}/newSeqs" ${DIFF_PAR} \ 51 || fail "Diff died" 52fi 53 54if [ ! -s "${TMP_PATH}/mappingSeqs" ]; then 55 cat <<WARN 56WARNING: There are no common sequences between $OLDDB and $NEWDB. 57If you aim to add the sequences of $NEWDB to your previous clustering $OLDCLUST, you can run: 58 59mmseqs concatdbs \"$OLDDB\" \"$NEWDB\" \"${OLDDB}.withNewSequences\" 60mmseqs concatdbs \"${OLDDB}_h\" \"${NEWDB}_h\" \"${OLDDB}.withNewSequences_h\" 61mmseqs clusterupdate \"$OLDDB\" \"${OLDDB}.withNewSequences\" \"$OLDCLUST\" \"$NEWCLUST\" \"${TMP_PATH}\" 62WARN 63 rm -f "${TMP_PATH}/removedSeqs" "${TMP_PATH}/mappingSeqs" "${TMP_PATH}/newSeqs" 64 exit 1 65fi 66 67if [ -s "${TMP_PATH}/removedSeqs" ]; then 68 if [ -n "${RECOVER_DELETED}" ]; then 69 log "=== Recover removed sequences" 70 if notExists "${TMP_PATH}/OLDDB.removedMapping"; then 71 HIGHESTID="$(awk '$1 > max { max = $1 } END { print max }' "${NEWDB}.index")" 72 awk -v highest="$HIGHESTID" 'BEGIN { start=highest+1 } { print $1"\t"start; start=start+1; }' \ 73 "${TMP_PATH}/removedSeqs" > "${TMP_PATH}/OLDDB.removedMapping" 74 cat "${TMP_PATH}/OLDDB.removedMapping" >> "${TMP_PATH}/mappingSeqs" 75 fi 76 77 if notExists "${TMP_PATH}/NEWDB.withOld.dbtype"; then 78 # shellcheck disable=SC2086 79 "$MMSEQS" renamedbkeys "${TMP_PATH}/OLDDB.removedMapping" "${OLDDB}" "${TMP_PATH}/OLDDB.removedDb" --subdb-mode 1 ${VERBOSITY} \ 80 || fail "renamedbkeys died" 81 # shellcheck disable=SC2086 82 "$MMSEQS" concatdbs "$NEWDB" "${TMP_PATH}/OLDDB.removedDb" "${TMP_PATH}/NEWDB.withOld" --preserve-keys --threads 1 ${VERBOSITY} \ 83 || fail "concatdbs died" 84 # shellcheck disable=SC2086 85 "$MMSEQS" concatdbs "${NEWDB}_h" "${TMP_PATH}/OLDDB.removedDb_h" "${TMP_PATH}/NEWDB.withOld_h" --preserve-keys --threads 1 ${VERBOSITY} \ 86 || fail "concatdbs died" 87 fi 88 NEWDB="${TMP_PATH}/NEWDB.withOld" 89 90 if [ -n "$REMOVE_TMP" ]; then 91 echo "Remove temporary files 1/3" 92 rm -f "${TMP_PATH}/OLDDB.removedMapping" 93 # shellcheck disable=SC2086 94 "$MMSEQS" rmdb "${TMP_PATH}/OLDDB.removedDb" ${VERBOSITY} 95 fi 96 else 97 if notExists "${TMP_PATH}/OLCLUST.withoutDeletedKeys.dbtype"; then 98 # shellcheck disable=SC2086 99 "$MMSEQS" createsubdb "${TMP_PATH}/mappingSeqs" "${OLDCLUST}" "${TMP_PATH}/OLCLUST.withoutDeletedKeys" --subdb-mode 1 ${VERBOSITY} \ 100 || fail "createsubdb died" 101 fi 102 if notExists "${TMP_PATH}/OLCLUST.withoutDeleted.dbtype"; then 103 # shellcheck disable=SC2086 104 "$MMSEQS" filterdb "${TMP_PATH}/OLCLUST.withoutDeletedKeys" "${TMP_PATH}/OLCLUST.withoutDeleted" --filter-file "${TMP_PATH}/removedSeqs" --positive-filter ${THREADS_PAR} \ 105 || fail "filterdb died" 106 fi 107 OLDCLUST="${TMP_PATH}/OLCLUST.withoutDeleted" 108 fi 109fi 110 111if notExists "${TMP_PATH}/newMappingSeqs"; then 112 log "=== Update new sequences with old keys" 113 MAXID="$(awk '$1 > max { max = $1 } END { print max }' "${OLDDB}.index" "${NEWDB}.index")" 114 awk -v highest="$MAXID" 'BEGIN { start=highest+1 } { print $1"\t"start; start=start+1; }' \ 115 "${TMP_PATH}/newSeqs" > "${TMP_PATH}/newSeqs.mapped" 116 awk '{ print $2"\t"$1 }' "${TMP_PATH}/mappingSeqs" > "${TMP_PATH}/mappingSeqs.reverse" 117 cat "${TMP_PATH}/mappingSeqs.reverse" "${TMP_PATH}/newSeqs.mapped" > "${TMP_PATH}/newMappingSeqs" 118 awk '{ print $2 }' "${TMP_PATH}/newSeqs.mapped" > "${TMP_PATH}/newSeqs" 119fi 120 121if notExists "${NEWMAPDB}.dbtype"; then 122 # shellcheck disable=SC2086 123 "$MMSEQS" renamedbkeys "${TMP_PATH}/newMappingSeqs" "${NEWDB}" "${NEWMAPDB}" ${VERBOSITY} \ 124 || fail "renamedbkeys died" 125fi 126NEWDB="${NEWMAPDB}" 127 128if notExists "${TMP_PATH}/NEWDB.newSeqs.dbtype"; then 129 log "=== Filter out new from old sequences" 130 # shellcheck disable=SC2086 131 "$MMSEQS" createsubdb "${TMP_PATH}/newSeqs" "$NEWDB" "${TMP_PATH}/NEWDB.newSeqs" ${VERBOSITY} --subdb-mode 1 \ 132 || fail "createsubdb died" 133fi 134 135if notExists "${TMP_PATH}/OLDDB.repSeq.dbtype"; then 136 log "=== Extract representative sequences" 137 # shellcheck disable=SC2086 138 "$MMSEQS" result2repseq "$OLDDB" "$OLDCLUST" "${TMP_PATH}/OLDDB.repSeq" ${RESULT2REPSEQ_PAR} \ 139 || fail "result2repseq died" 140fi 141 142if notExists "${TMP_PATH}/newSeqsHits.dbtype"; then 143 log "=== Search new sequences against representatives" 144 # shellcheck disable=SC2086 145 "$MMSEQS" search "${TMP_PATH}/NEWDB.newSeqs" "${TMP_PATH}/OLDDB.repSeq" "${TMP_PATH}/newSeqsHits" "${TMP_PATH}/search" ${SEARCH_PAR} \ 146 || fail "search died" 147fi 148 149if notExists "${TMP_PATH}/newSeqsHits.swapped.all.dbtype"; then 150 # shellcheck disable=SC2086 151 "$MMSEQS" swapdb "${TMP_PATH}/newSeqsHits" "${TMP_PATH}/newSeqsHits.swapped.all" ${THREADS_PAR} \ 152 || fail "swapdb died" 153 awk '$3 > 1 { print 1; exit; }' "${TMP_PATH}/newSeqsHits.swapped.all.index" > "${TMP_PATH}/newSeqsHits.swapped.hasHits" 154fi 155 156if [ -s "${TMP_PATH}/newSeqsHits.swapped.hasHits" ] && notExists "${TMP_PATH}/newSeqsHits.swapped.dbtype"; then 157 # shellcheck disable=SC2086 158 "$MMSEQS" filterdb "${TMP_PATH}/newSeqsHits.swapped.all" "${TMP_PATH}/newSeqsHits.swapped" --trim-to-one-column ${THREADS_PAR} \ 159 || fail "filterdb died" 160fi 161 162UPDATEDCLUST="${TMP_PATH}/updatedClust" 163if [ -f "${TMP_PATH}/newSeqsHits.swapped.dbtype" ]; then 164 if notExists "${TMP_PATH}/updatedClust.dbtype"; then 165 log "=== Merge found sequences with previous clustering" 166 # shellcheck disable=SC2086 167 "$MMSEQS" mergedbs "$OLDCLUST" "${TMP_PATH}/updatedClust" "$OLDCLUST" "${TMP_PATH}/newSeqsHits.swapped" ${VERBOSITY} \ 168 || fail "mergedbs died" 169 fi 170else 171 UPDATEDCLUST="$OLDCLUST" 172fi 173 174if notExists "${TMP_PATH}/toBeClusteredSeparately.dbtype"; then 175 log "=== Extract unmapped sequences" 176 awk '$3 == 1 {print $1}' "${TMP_PATH}/newSeqsHits.index" > "${TMP_PATH}/noHitSeqList" 177 # shellcheck disable=SC2086 178 "$MMSEQS" createsubdb "${TMP_PATH}/noHitSeqList" "$NEWDB" "${TMP_PATH}/toBeClusteredSeparately" ${VERBOSITY} --subdb-mode 1 \ 179 || fail "createsubdb of not hit seq. died" 180fi 181 182if notExists "${TMP_PATH}/newClusters.dbtype" && [ -s "${TMP_PATH}/toBeClusteredSeparately.index" ]; then 183 log "=== Cluster separately the singleton sequences" 184 # shellcheck disable=SC2086 185 "$MMSEQS" cluster "${TMP_PATH}/toBeClusteredSeparately" "${TMP_PATH}/newClusters" "${TMP_PATH}/cluster" ${CLUST_PAR} \ 186 || fail "cluster of new seq. died" 187fi 188 189if [ -f "${TMP_PATH}/newClusters.dbtype" ]; then 190 if notExists "$NEWCLUST"; then 191 log "=== Merge updated clustering with new clusters" 192 # shellcheck disable=SC2086 193 "$MMSEQS" concatdbs "${UPDATEDCLUST}" "${TMP_PATH}/newClusters" "$NEWCLUST" --preserve-keys ${THREADS_PAR} \ 194 || fail "concatdbs died" 195 fi 196else 197 # shellcheck disable=SC2086 198 "$MMSEQS" mvdb "${UPDATEDCLUST}" "$NEWCLUST" ${VERBOSITY} 199fi 200 201if [ -n "$REMOVE_TMP" ]; then 202 rm -f "${TMP_PATH}/newSeqs.mapped" "${TMP_PATH}/mappingSeqs.reverse" "${TMP_PATH}/newMappingSeqs" 203 rm -f "${TMP_PATH}/noHitSeqList" "${TMP_PATH}/mappingSeqs" "${TMP_PATH}/newSeqs" "${TMP_PATH}/removedSeqs" 204 rm -f "${TMP_PATH}/newSeqsHits.swapped.hasHits" 205 206 if [ -n "${RECOVER_DELETED}" ]; then 207 # shellcheck disable=SC2086 208 "$MMSEQS" rmdb "${TMP_PATH}/NEWDB.withOld" ${VERBOSITY} 209 # shellcheck disable=SC2086 210 "$MMSEQS" rmdb "${TMP_PATH}/NEWDB.withOld_h" ${VERBOSITY} 211 else 212 # shellcheck disable=SC2086 213 "$MMSEQS" rmdb "${TMP_PATH}/OLCLUST.withoutDeletedKeys" ${VERBOSITY} 214 # shellcheck disable=SC2086 215 "$MMSEQS" rmdb "${TMP_PATH}/OLCLUST.withoutDeleted" ${VERBOSITY} 216 fi 217 218 # shellcheck disable=SC2086 219 "$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits.swapped" ${VERBOSITY} 220 # shellcheck disable=SC2086 221 "$MMSEQS" rmdb "${TMP_PATH}/newClusters" ${VERBOSITY} 222 # shellcheck disable=SC2086 223 "$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits" ${VERBOSITY} 224 # shellcheck disable=SC2086 225 "$MMSEQS" rmdb "${TMP_PATH}/toBeClusteredSeparately" ${VERBOSITY} 226 # shellcheck disable=SC2086 227 "$MMSEQS" rmdb "${TMP_PATH}/NEWDB.newSeqs" ${VERBOSITY} 228 # shellcheck disable=SC2086 229 "$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits.swapped.all" ${VERBOSITY} 230 # shellcheck disable=SC2086 231 "$MMSEQS" rmdb "${TMP_PATH}/OLDDB.repSeq" ${VERBOSITY} 232 # shellcheck disable=SC2086 233 "$MMSEQS" rmdb "${TMP_PATH}/updatedClust" ${VERBOSITY} 234 235 rm -rf "${TMP_PATH}/search" "${TMP_PATH}/cluster" 236 rm -f "${TMP_PATH}/update_clustering.sh" 237fi 238