1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "Text2SequenceWorker.h"
23 
24 #include <U2Core/AppContext.h>
25 #include <U2Core/FailTask.h>
26 #include <U2Core/U2AlphabetUtils.h>
27 
28 #include <U2Designer/DelegateEditors.h>
29 
30 #include <U2Gui/SeqPasterWidgetController.h>
31 
32 #include <U2Lang/ActorPrototypeRegistry.h>
33 #include <U2Lang/BaseActorCategories.h>
34 #include <U2Lang/BasePorts.h>
35 #include <U2Lang/BaseSlots.h>
36 #include <U2Lang/BaseTypes.h>
37 #include <U2Lang/CoreLibConstants.h>
38 #include <U2Lang/WorkflowEnv.h>
39 
40 namespace U2 {
41 namespace LocalWorkflow {
42 
// Identifier under which the "Convert Text to Sequence" actor prototype is registered.
const QString Text2SequenceWorkerFactory::ACTOR_ID("convert-text-to-sequence");

// Descriptors of the map data types attached to the input (text) and output (sequence) ports.
static const Descriptor TEXT_2_SEQUENCE_IN_TYPE_ID("text-2-sequence-in-type");
static const Descriptor TEXT_2_SEQUENCE_OUT_TYPE_ID("text-2-sequence-out-type");

// Identifiers of the actor attributes read by the worker and the prompter.
static const QString SEQ_NAME_ATTR_ID("sequence-name");
static const QString ALPHABET_ATTR_ID("alphabet");
static const QString SKIP_SYM_ATTR_ID("skip-unknown");
static const QString REPLACE_SYM_ATTR_ID("replace-unknown-with");

// Default attribute values. The alphabet default doubles as the marker
// that requests automatic alphabet detection.
static const QString SEQ_NAME_ATTR_DEF_VAL("Sequence");
static const QString ALPHABET_ATTR_ID_DEF_VAL("Auto");
55 
56 /*******************************
57  * Text2SequenceWorker
58  *******************************/
// Alphabet id -> human-readable alphabet name; filled once by initCuteAlNames().
QMap<QString, QString> Text2SequenceWorker::cuteAlIdNames = Text2SequenceWorker::initCuteAlNames();
60 
initCuteAlNames()61 QMap<QString, QString> Text2SequenceWorker::initCuteAlNames() {
62     QMap<QString, QString> res;
63     res[BaseDNAAlphabetIds::RAW()] = "All symbols";
64     res[BaseDNAAlphabetIds::NUCL_DNA_DEFAULT()] = "Standard DNA";
65     res[BaseDNAAlphabetIds::NUCL_RNA_DEFAULT()] = "Standard RNA";
66     res[BaseDNAAlphabetIds::NUCL_DNA_EXTENDED()] = "Extended DNA";
67     res[BaseDNAAlphabetIds::NUCL_RNA_EXTENDED()] = "Extended RNA";
68     res[BaseDNAAlphabetIds::AMINO_DEFAULT()] = "Standard amino";
69     return res;
70 }
71 
init()72 void Text2SequenceWorker::init() {
73     txtPort = ports.value(BasePorts::IN_TEXT_PORT_ID());
74     outSeqPort = ports.value(BasePorts::OUT_SEQ_PORT_ID());
75 }
76 
tick()77 Task *Text2SequenceWorker::tick() {
78     while (txtPort->hasMessage()) {
79         Message inputMessage = getMessageAndSetupScriptValues(txtPort);
80         if (inputMessage.isEmpty()) {
81             outSeqPort->transit();
82             continue;
83         }
84         QString seqName = actor->getParameter(SEQ_NAME_ATTR_ID)->getAttributeValue<QString>(context);
85         if (seqName.isEmpty()) {
86             return new FailTask(tr("Sequence name not set"));
87         }
88         if (tickedNum++ > 0) {
89             seqName += QString::number(tickedNum);
90         }
91         QString alId = actor->getParameter(ALPHABET_ATTR_ID)->getAttributeValue<QString>(context);
92         if (alId.isEmpty()) {
93             alId = ALPHABET_ATTR_ID_DEF_VAL;
94         } else {
95             alId = cuteAlIdNames.key(alId, alId);
96         }
97         bool skipUnknown = actor->getParameter(SKIP_SYM_ATTR_ID)->getAttributeValue<bool>(context);
98         QChar replaceChar;
99         if (!skipUnknown) {
100             QString replaceStr = actor->getParameter(REPLACE_SYM_ATTR_ID)->getAttributeValue<QString>(context);
101             assert(replaceStr.size() <= 1);
102             if (replaceStr.isEmpty()) {
103                 return new FailTask(tr("skip flag should be set or replace character defined"));
104             }
105             replaceChar = replaceStr.at(0);
106         }
107         QByteArray txt = inputMessage.getData().toMap().value(BaseSlots::TEXT_SLOT().getId()).value<QString>().toUtf8();
108 
109         const DNAAlphabet *alphabet = (alId == ALPHABET_ATTR_ID_DEF_VAL) ? U2AlphabetUtils::findBestAlphabet(txt) : U2AlphabetUtils::getById(alId);
110         if (alphabet == nullptr) {
111             QString msg;
112             if (alId == ALPHABET_ATTR_ID_DEF_VAL) {
113                 msg = tr("Alphabet cannot be automatically detected");
114             } else {
115                 msg = tr("Alphabet '%1' cannot be found");
116             }
117             return new FailTask(msg);
118         }
119 
120         QByteArray normSequence = SeqPasterWidgetController::getNormSequence(alphabet, txt, !skipUnknown, replaceChar);
121         DNASequence result(seqName, normSequence, alphabet);
122         QVariantMap msgData;
123         {
124             SharedDbiDataHandler seqId = context->getDataStorage()->putSequence(result);
125             msgData[BaseSlots::DNA_SEQUENCE_SLOT().getId()] = qVariantFromValue<SharedDbiDataHandler>(seqId);
126         }
127         if (outSeqPort) {
128             outSeqPort->put(Message(BaseTypes::DNA_SEQUENCE_TYPE(), msgData));
129         }
130     }
131     if (txtPort->isEnded()) {
132         setDone();
133         outSeqPort->setEnded();
134     }
135     return nullptr;
136 }
137 
// Intentionally empty: this function performs no cleanup work.
void Text2SequenceWorker::cleanup() {
}
140 
141 /*******************************
142  * Text2SequenceWorkerFactory
143  *******************************/
init()144 void Text2SequenceWorkerFactory::init() {
145     // ports description
146     QList<PortDescriptor *> portDescs;
147     {
148         QMap<Descriptor, DataTypePtr> inM;
149         inM[BaseSlots::TEXT_SLOT()] = BaseTypes::STRING_TYPE();
150         DataTypePtr inSet(new MapDataType(TEXT_2_SEQUENCE_IN_TYPE_ID, inM));
151         Descriptor inPortDesc(BasePorts::IN_TEXT_PORT_ID(), Text2SequenceWorker::tr("Input text"), Text2SequenceWorker::tr("A text which will be converted to sequence"));
152         portDescs << new PortDescriptor(inPortDesc, inSet, true);
153 
154         QMap<Descriptor, DataTypePtr> outM;
155         outM[BaseSlots::DNA_SEQUENCE_SLOT()] = BaseTypes::DNA_SEQUENCE_TYPE();
156         DataTypePtr outSet(new MapDataType(TEXT_2_SEQUENCE_OUT_TYPE_ID, outM));
157         Descriptor outPortDesc(BasePorts::OUT_SEQ_PORT_ID(), Text2SequenceWorker::tr("Output sequence"), Text2SequenceWorker::tr("Converted sequence"));
158         portDescs << new PortDescriptor(outPortDesc, outSet, false);
159     }
160     // attributes description
161     QList<Attribute *> attrs;
162     {
163         Descriptor seqNameDesc(SEQ_NAME_ATTR_ID, Text2SequenceWorker::tr("Sequence name"), Text2SequenceWorker::tr("Result sequence name."));
164         Descriptor alphabetDesc(ALPHABET_ATTR_ID, Text2SequenceWorker::tr("Sequence alphabet"), Text2SequenceWorker::tr("Select one of the listed alphabets or choose auto to auto-detect."));
165         Descriptor skipSymbolsDesc(SKIP_SYM_ATTR_ID, Text2SequenceWorker::tr("Skip unknown symbols"), Text2SequenceWorker::tr("Do not include symbols that are not contained in alphabet."));
166         Descriptor replaceSymbolsDesc(REPLACE_SYM_ATTR_ID, Text2SequenceWorker::tr("Replace unknown symbols with"), Text2SequenceWorker::tr("Replace unknown symbols with given character."));
167 
168         attrs << new Attribute(seqNameDesc, BaseTypes::STRING_TYPE(), /* required */ true, QVariant(SEQ_NAME_ATTR_DEF_VAL));
169         attrs << new Attribute(alphabetDesc, BaseTypes::STRING_TYPE(), false, QVariant(ALPHABET_ATTR_ID_DEF_VAL));
170         attrs << new Attribute(skipSymbolsDesc, BaseTypes::BOOL_TYPE(), false, QVariant(true));
171         attrs << new Attribute(replaceSymbolsDesc, BaseTypes::STRING_TYPE(), false);
172     }
173 
174     Descriptor protoDesc(Text2SequenceWorkerFactory::ACTOR_ID,
175                          Text2SequenceWorker::tr("Convert Text to Sequence"),
176                          Text2SequenceWorker::tr("Converts input text to sequence."));
177     ActorPrototype *proto = new IntegralBusActorPrototype(protoDesc, portDescs, attrs);
178 
179     // proto delegates
180     QMap<QString, PropertyDelegate *> delegates;
181     {
182         QVariantMap alMap;
183         QList<const DNAAlphabet *> alps = AppContext::getDNAAlphabetRegistry()->getRegisteredAlphabets();
184         foreach (const DNAAlphabet *a, alps) {
185             alMap[a->getName()] = Text2SequenceWorker::cuteAlIdNames[a->getId()];
186         }
187         alMap[ALPHABET_ATTR_ID_DEF_VAL] = ALPHABET_ATTR_ID_DEF_VAL;
188         delegates[ALPHABET_ATTR_ID] = new ComboBoxDelegate(alMap);
189 
190         delegates[REPLACE_SYM_ATTR_ID] = new CharacterDelegate();
191     }
192     proto->setEditor(new DelegateEditor(delegates));
193     proto->setPrompter(new Text2SequencePrompter());
194 
195     WorkflowEnv::getProtoRegistry()->registerProto(BaseActorCategories::CATEGORY_CONVERTERS(), proto);
196     WorkflowEnv::getDomainRegistry()->getById(LocalDomainFactory::ID)->registerEntry(new Text2SequenceWorkerFactory());
197 }
198 
createWorker(Actor * a)199 Worker *Text2SequenceWorkerFactory::createWorker(Actor *a) {
200     return new Text2SequenceWorker(a);
201 }
202 
203 /*******************************
204  * Text2SequencePrompter
205  *******************************/
composeRichDoc()206 QString Text2SequencePrompter::composeRichDoc() {
207     QString unsetStr = "<font color='red'>" + tr("unset") + "</font>";
208     IntegralBusPort *input = qobject_cast<IntegralBusPort *>(target->getPort(BasePorts::IN_TEXT_PORT_ID()));
209     Actor *txtProducer = input->getProducer(BaseSlots::TEXT_SLOT().getId());
210     QString txtProducetStr = tr(" from <u>%1</u>").arg(txtProducer ? txtProducer->getLabel() : unsetStr);
211 
212     QString seqName = getRequiredParam(SEQ_NAME_ATTR_ID);
213     QString seqNameStr = tr("sequence with name <u>%1</u>").arg(getHyperlink(SEQ_NAME_ATTR_ID, seqName));
214 
215     QString alId = getParameter(ALPHABET_ATTR_ID).value<QString>();
216     QString seqAlStr;
217     if (alId == ALPHABET_ATTR_ID_DEF_VAL) {
218         seqAlStr = getHyperlink(ALPHABET_ATTR_ID, tr("Automatically detect sequence alphabet"));
219     } else {
220         alId = Text2SequenceWorker::cuteAlIdNames.key(alId, "");
221         const DNAAlphabet *alphabet = AppContext::getDNAAlphabetRegistry()->findById(alId);
222         QString alphStr = getHyperlink(ALPHABET_ATTR_ID, alphabet ? alphabet->getName() : unsetStr);
223         seqAlStr = tr("Set sequence alphabet to %1").arg(alphStr);
224     }
225 
226     bool skipUnknown = getParameter(SKIP_SYM_ATTR_ID).value<bool>();
227     QString replaceStr = getRequiredParam(REPLACE_SYM_ATTR_ID);
228     QString unknownSymbolsStr;
229     if (skipUnknown) {
230         unknownSymbolsStr = getHyperlink(SKIP_SYM_ATTR_ID, tr("skipped"));
231     } else {
232         unknownSymbolsStr = QString("%1 %2")
233                                 .arg(getHyperlink(SKIP_SYM_ATTR_ID, tr("replaced with symbol")))
234                                 .arg(getHyperlink(REPLACE_SYM_ATTR_ID, replaceStr));
235     }
236 
237     QString doc = tr("Convert input text%1 to %2. %3. Unknown symbols are %4.")
238                       .arg(txtProducetStr)
239                       .arg(seqNameStr)
240                       .arg(seqAlStr)
241                       .arg(unknownSymbolsStr);
242     return doc;
243 }
244 
245 }  // namespace LocalWorkflow
246 }  // namespace U2
247