/*
   Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation.  The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have included with MySQL.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
*/

#define DBSPJ_C
#include "Dbspj.hpp"

#include <ndb_version.h>
#include <SectionReader.hpp>
#include <signaldata/LqhKey.hpp>
#include <signaldata/QueryTree.hpp>
#include <signaldata/TcKeyRef.hpp>
#include <signaldata/RouteOrd.hpp>
#include <signaldata/TransIdAI.hpp>
#include <signaldata/DiGetNodes.hpp>
#include <signaldata/DihScanTab.hpp>
#include <signaldata/AttrInfo.hpp>
#include <signaldata/CreateTab.hpp>
#include <signaldata/PrepDropTab.hpp>
#include <signaldata/DropTab.hpp>
#include <signaldata/AlterTab.hpp>
#include <signaldata/AlterTable.hpp>
#include <signaldata/DbspjErr.hpp>
#include <Interpreter.hpp>
#include <AttributeHeader.hpp>
#include <AttributeDescriptor.hpp>
#include <KeyDescriptor.hpp>
#include <md5_hash.hpp>
#include <signaldata/TcKeyConf.hpp>

#include <signaldata/NodeFailRep.hpp>
#include <signaldata/ReadNodesConf.hpp>
#include <signaldata/SignalDroppedRep.hpp>
#include <EventLogger.hpp>
#include <Bitmask.hpp>

#define JAM_FILE_ID 479

extern EventLogger* g_eventLogger;
extern Uint32 ErrorSignalReceive;
extern Uint32 ErrorMaxSegmentsToSeize;

#ifdef VM_TRACE
/**
 * 12 bits are used to represent the 'parent-row-correlation-id',
 * effectively limiting the max number of rows in a batch.
 */
static const Uint32 MaxCorrelationId = (1 << 12);
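
/**
 * A compile-time illustration of the limit above (an added sketch for
 * clarity only, not part of the original block logic): 12 correlation-id
 * bits allow at most 2^12 = 4096 rows per batch.
 */
static_assert(MaxCorrelationId == 4096,
              "12-bit correlation id limits a batch to 4096 rows");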

/**
 * DEBUG options for different parts of the SPJ block.
 * Comment out those parts you don't want DEBUG'ed.
 */
//#define DEBUG(x) ndbout << "DBSPJ: "<< x << endl
//#define DEBUG_DICT(x) ndbout << "DBSPJ: "<< x << endl
//#define DEBUG_LQHKEYREQ
//#define DEBUG_SCAN_FRAGREQ
#endif

/**
 * Provide empty defs for those DEBUGs which have to be defined.
 */
#if !defined(DEBUG)
#define DEBUG(x)
#endif

#if !defined(DEBUG_DICT)
#define DEBUG_DICT(x)
#endif

#define DEBUG_CRASH() ndbassert(false)

const Ptr<Dbspj::TreeNode> Dbspj::NullTreeNodePtr(0, RNIL);
const Dbspj::RowRef Dbspj::NullRowRef = { RNIL, GLOBAL_PAGE_SIZE_WORDS, { 0 } };


void Dbspj::execSIGNAL_DROPPED_REP(Signal* signal)
{
  /* An incoming signal was dropped, handle it.
   * Dropped signal really means that we ran out of
   * long signal buffering to store its sections.
   */
  jamEntry();

  if (!assembleDroppedFragments(signal))
  {
    jam();
    return;
  }

  const SignalDroppedRep* rep = (SignalDroppedRep*) &signal->theData[0];
  const Uint32 originalGSN = rep->originalGsn;

  DEBUG("SignalDroppedRep received for GSN " << originalGSN);

  switch(originalGSN) {
  case GSN_LQHKEYREQ:  //TC -> SPJ
  {
    jam();
    const LqhKeyReq * const truncatedLqhKeyReq =
      reinterpret_cast<const LqhKeyReq*>(&rep->originalData[0]);

    handle_early_lqhkey_ref(signal, truncatedLqhKeyReq,
                            DbspjErr::OutOfSectionMemory);
    break;
  }
  case GSN_SCAN_FRAGREQ: //TC -> SPJ
  {
    jam();
    /* Get information necessary to send SCAN_FRAGREF back to TC */
    // TODO : Handle dropped signal fragments

    const ScanFragReq * const truncatedScanFragReq =
      reinterpret_cast<const ScanFragReq*>(&rep->originalData[0]);

    handle_early_scanfrag_ref(signal, truncatedScanFragReq,
                              DbspjErr::OutOfSectionMemory);
    break;
  }
  case GSN_TRANSID_AI: //TUP -> SPJ
  {
    jam();
    const TransIdAI * const truncatedTransIdAI =
      reinterpret_cast<const TransIdAI*>(&rep->originalData[0]);
    const Uint32 ptrI = truncatedTransIdAI->connectPtr;

    Ptr<TreeNode> treeNodePtr;
    m_treenode_pool.getPtr(treeNodePtr, ptrI);
    Ptr<Request> requestPtr;
    m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);

    /**
     * Register signal as arrived -> 'done' if this completed this treeNode
     */
    ndbassert(treeNodePtr.p->m_info && treeNodePtr.p->m_info->m_countSignal);
    (this->*(treeNodePtr.p->m_info->m_countSignal))(signal,
                                                    requestPtr,
                                                    treeNodePtr, 1);

    abort(signal, requestPtr, DbspjErr::OutOfSectionMemory);
    break;
  }
  default:
    jam();
    /* Don't expect dropped signals for other GSNs */
    SimulatedBlock::execSIGNAL_DROPPED_REP(signal);
  }

#ifdef ERROR_INSERT
  if (ErrorSignalReceive == DBSPJ)
  {
    jam();
    ErrorSignalReceive = 0;
  }
#endif

  return;
}

inline
Uint32
Dbspj::TableRecord::checkTableError(Uint32 schemaVersion) const
{
  DEBUG_DICT("Dbspj::TableRecord::checkTableError"
            << ", m_flags: " << m_flags
            << ", m_currentSchemaVersion: " << m_currentSchemaVersion
            << ", check schemaVersion: " << schemaVersion);

  if (!get_enabled())
    return DbspjErr::NoSuchTable;
  if (get_dropping())
    return DbspjErr::DropTableInProgress;
  if (table_version_major(schemaVersion) != table_version_major(m_currentSchemaVersion))
    return DbspjErr::WrongSchemaVersion;

  return 0;
}
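
/**
 * Typical use of checkTableError() before acting on a table (an
 * illustrative sketch only; 'tablePtr' and 'requestedSchemaVersion'
 * are hypothetical locals, not definitions from this file):
 *
 *   const Uint32 err = tablePtr.p->checkTableError(requestedSchemaVersion);
 *   if (unlikely(err != 0))
 *   {
 *     jam();
 *     abort(signal, requestPtr, err);  // e.g. DbspjErr::WrongSchemaVersion
 *     return;
 *   }
 */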

// create table prepare
void Dbspj::execTC_SCHVERREQ(Signal* signal)
{
  jamEntry();
  if (! assembleFragments(signal)) {
    jam();
    return;
  }
  const TcSchVerReq* req = CAST_CONSTPTR(TcSchVerReq, signal->getDataPtr());
  const Uint32 tableId = req->tableId;
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;

  DEBUG_DICT("Dbspj::execTC_SCHVERREQ"
     << ", tableId: " << tableId
     << ", version: " << req->tableVersion
  );

  TableRecordPtr tablePtr;
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);

  ndbrequire(tablePtr.p->get_prepared() == false);
  ndbrequire(tablePtr.p->get_enabled() == false);
  new (tablePtr.p) TableRecord(req->tableVersion);

  if (req->readBackup)
  {
    jam();
    tablePtr.p->m_flags |= TableRecord::TR_READ_BACKUP;
  }

  if (req->fullyReplicated)
  {
    jam();
    tablePtr.p->m_flags |= TableRecord::TR_FULLY_REPLICATED;
  }

  /**
   * NOTE: Even if there is more information, like
   * 'tableType', 'noOfPrimaryKeys' etc., available from
   * TcSchVerReq, we do *not* store that in TableRecord.
   * Instead this information is retrieved on demand from
   * g_key_descriptor_pool where it is readily available.
   * The 'contract' for consistency of this information is
   * such that:
   * 1) g_key_descriptor[ENTRY] will be populated *before*
   *    any block receives CREATE_TAB_REQ (or equivalent).
   * 2) g_key_descriptor[ENTRY] will be invalidated *after*
   *    all blocks have sent DROP_TAB_CONF (commit).
   * Thus, this info is consistent whenever required by SPJ.
   */
  TcSchVerConf * conf = (TcSchVerConf*)signal->getDataPtr();
  conf->senderRef = reference();
  conf->senderData = senderData;
  sendSignal(senderRef, GSN_TC_SCHVERCONF, signal,
             TcSchVerConf::SignalLength, JBB);
}//Dbspj::execTC_SCHVERREQ()

// create table commit
void Dbspj::execTAB_COMMITREQ(Signal* signal)
{
  jamEntry();
  const Uint32 senderData = signal->theData[0];
  const Uint32 senderRef = signal->theData[1];
  const Uint32 tableId = signal->theData[2];

  DEBUG_DICT("Dbspj::execTAB_COMMITREQ"
     << ", tableId: " << tableId
  );

  TableRecordPtr tablePtr;
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);

  ndbrequire(tablePtr.p->get_prepared() == true);
  ndbrequire(tablePtr.p->get_enabled() == false);
  tablePtr.p->set_enabled(true);
  tablePtr.p->set_prepared(false);
  tablePtr.p->set_dropping(false);

  signal->theData[0] = senderData;
  signal->theData[1] = reference();
  signal->theData[2] = tableId;
  sendSignal(senderRef, GSN_TAB_COMMITCONF, signal, 3, JBB);
}//Dbspj::execTAB_COMMITREQ

void
Dbspj::execPREP_DROP_TAB_REQ(Signal* signal)
{
  jamEntry();

  PrepDropTabReq* req = (PrepDropTabReq*)signal->getDataPtr();
  const Uint32 tableId = req->tableId;
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;

  DEBUG_DICT("Dbspj::execPREP_DROP_TAB_REQ"
     << ", tableId: " << tableId
  );

  TableRecordPtr tablePtr;
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);

  if (!tablePtr.p->get_enabled())
  {
    jam();
    PrepDropTabRef* ref = (PrepDropTabRef*)signal->getDataPtrSend();
    ref->senderRef = reference();
    ref->senderData = senderData;
    ref->tableId = tableId;
    ref->errorCode = PrepDropTabRef::NoSuchTable;
    sendSignal(senderRef, GSN_PREP_DROP_TAB_REF, signal,
               PrepDropTabRef::SignalLength, JBB);
    return;
  }

  if (tablePtr.p->get_dropping())
  {
    jam();
    PrepDropTabRef* ref = (PrepDropTabRef*)signal->getDataPtrSend();
    ref->senderRef = reference();
    ref->senderData = senderData;
    ref->tableId = tableId;
    ref->errorCode = PrepDropTabRef::DropInProgress;
    sendSignal(senderRef, GSN_PREP_DROP_TAB_REF, signal,
               PrepDropTabRef::SignalLength, JBB);
    return;
  }

  tablePtr.p->set_dropping(true);
  tablePtr.p->set_prepared(false);

  PrepDropTabConf* conf = (PrepDropTabConf*)signal->getDataPtrSend();
  conf->tableId = tableId;
  conf->senderRef = reference();
  conf->senderData = senderData;
  sendSignal(senderRef, GSN_PREP_DROP_TAB_CONF, signal,
             PrepDropTabConf::SignalLength, JBB);
}//Dbspj::execPREP_DROP_TAB_REQ

void
Dbspj::execDROP_TAB_REQ(Signal* signal)
{
  jamEntry();

  const DropTabReq* req = (DropTabReq*)signal->getDataPtr();
  const Uint32 tableId = req->tableId;
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  DropTabReq::RequestType rt = (DropTabReq::RequestType)req->requestType;

  DEBUG_DICT("Dbspj::execDROP_TAB_REQ"
     << ", tableId: " << tableId
  );

  TableRecordPtr tablePtr;
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);

  if (rt == DropTabReq::OnlineDropTab){
    if (!tablePtr.p->get_enabled()){
      jam();
      DropTabRef* ref = (DropTabRef*)signal->getDataPtrSend();
      ref->senderRef = reference();
      ref->senderData = senderData;
      ref->tableId = tableId;
      ref->errorCode = DropTabRef::NoSuchTable;
      sendSignal(senderRef, GSN_DROP_TAB_REF, signal,
                 DropTabRef::SignalLength, JBB);
      return;
    }
    if (!tablePtr.p->get_dropping()){
      jam();
      DropTabRef* ref = (DropTabRef*)signal->getDataPtrSend();
      ref->senderRef = reference();
      ref->senderData = senderData;
      ref->tableId = tableId;
      ref->errorCode = DropTabRef::DropWoPrep;
      sendSignal(senderRef, GSN_DROP_TAB_REF, signal,
                 DropTabRef::SignalLength, JBB);
      return;
    }
  }

  tablePtr.p->set_enabled(false);
  tablePtr.p->set_prepared(false);
  tablePtr.p->set_dropping(false);

  DropTabConf * conf = (DropTabConf*)signal->getDataPtrSend();
  conf->tableId = tableId;
  conf->senderRef = reference();
  conf->senderData = senderData;
  sendSignal(senderRef, GSN_DROP_TAB_CONF, signal,
             DropTabConf::SignalLength, JBB);
}//Dbspj::execDROP_TAB_REQ

void
Dbspj::execALTER_TAB_REQ(Signal* signal)
{
  jamEntry();

  const AlterTabReq* req = (const AlterTabReq*)signal->getDataPtr();
  const Uint32 tableId = req->tableId;
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  const Uint32 tableVersion = req->tableVersion;
  const Uint32 newTableVersion = req->newTableVersion;
  AlterTabReq::RequestType requestType =
    (AlterTabReq::RequestType) req->requestType;
  D("ALTER_TAB_REQ(SPJ)");

  DEBUG_DICT("Dbspj::execALTER_TAB_REQ"
     << ", tableId: " << tableId
     << ", version: " << tableVersion << " --> " << newTableVersion
  );

  TableRecordPtr tablePtr;
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);

  switch (requestType) {
  case AlterTabReq::AlterTablePrepare:
    jam();
    break;
  case AlterTabReq::AlterTableRevert:
    jam();
    tablePtr.p->m_currentSchemaVersion = tableVersion;
    break;
  case AlterTabReq::AlterTableCommit:
    jam();
    tablePtr.p->m_currentSchemaVersion = newTableVersion;
    if (AlterTableReq::getReadBackupFlag(req->changeMask))
    {
      /**
       * We simply swap the flag, the preparatory work for this
       * change is done in DBTC.
       */
      if ((tablePtr.p->m_flags & TableRecord::TR_READ_BACKUP) != 0)
      {
        jam();
        /* Reset Read Backup flag */
        tablePtr.p->m_flags &= (~(TableRecord::TR_READ_BACKUP));
      }
      else
      {
        jam();
        /* Set Read Backup flag */
        tablePtr.p->m_flags |= TableRecord::TR_READ_BACKUP;
      }
    }
    break;
  default:
    ndbabort();
  }

  AlterTabConf* conf = (AlterTabConf*)signal->getDataPtrSend();
  conf->senderRef = reference();
  conf->senderData = senderData;
  conf->connectPtr = RNIL;
  sendSignal(senderRef, GSN_ALTER_TAB_CONF, signal,
             AlterTabConf::SignalLength, JBB);
}//Dbspj::execALTER_TAB_REQ

/**
 * Read our configuration and initialize pools, hashes
 * and table records from it.
 */
void Dbspj::execREAD_CONFIG_REQ(Signal* signal)
{
  jamEntry();
  const ReadConfigReq req =
    *reinterpret_cast<const ReadConfigReq*>(signal->getDataPtr());

  Pool_context pc;
  pc.m_block = this;

  DEBUG("execREAD_CONFIG_REQ");
  DEBUG("sizeof(Request): " << sizeof(Request) <<
        " sizeof(TreeNode): " << sizeof(TreeNode));

  m_arenaAllocator.init(1024, RT_SPJ_ARENA_BLOCK, pc);
  m_request_pool.arena_pool_init(&m_arenaAllocator, RT_SPJ_REQUEST, pc);
  m_treenode_pool.arena_pool_init(&m_arenaAllocator, RT_SPJ_TREENODE, pc);
  m_scanfraghandle_pool.arena_pool_init(&m_arenaAllocator, RT_SPJ_SCANFRAG, pc);
  m_lookup_request_hash.setSize(16);
  m_scan_request_hash.setSize(16);
  void* ptr = m_ctx.m_mm.get_memroot();
  m_page_pool.set((RowPage*)ptr, (Uint32)~0);

  Record_info ri;
  Dependency_map::createRecordInfo(ri, RT_SPJ_DATABUFFER);
  m_dependency_map_pool.init(&m_arenaAllocator, ri, pc);

  {
    const ndb_mgm_configuration_iterator * p =
      m_ctx.m_config.getOwnConfigIterator();
    ndbrequire(p != 0);

    ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_SPJ_TABLE, &c_tabrecFilesize));
  }
  m_tableRecord = (TableRecord*)allocRecord("TableRecord",
                                            sizeof(TableRecord),
                                            c_tabrecFilesize);

  TableRecordPtr tablePtr;
  for (tablePtr.i = 0; tablePtr.i < c_tabrecFilesize; tablePtr.i++) {
    ptrAss(tablePtr, m_tableRecord);
    new (tablePtr.p) TableRecord;
  }//for

  ReadConfigConf* const conf =
    reinterpret_cast<ReadConfigConf*>(signal->getDataPtrSend());
  conf->senderRef = reference();
  conf->senderData = req.senderData;

  sendSignal(req.senderRef, GSN_READ_CONFIG_CONF, signal,
             ReadConfigConf::SignalLength, JBB);
}//Dbspj::execREAD_CONFIG_REQ()

static Uint32 f_STTOR_REF = 0;

void Dbspj::execSTTOR(Signal* signal)
{
//#define UNIT_TEST_DATABUFFER2

  jamEntry();
  /* START CASE */
  const Uint16 tphase = signal->theData[1];
  f_STTOR_REF = signal->getSendersBlockRef();

  if (tphase == 1)
  {
    jam();
    signal->theData[0] = 0;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 1000, 1);
  }

  if (tphase == 4)
  {
    jam();

    signal->theData[0] = reference();
    sendSignal(NDBCNTR_REF, GSN_READ_NODESREQ, signal, 1, JBB);
    return;
  }

  sendSTTORRY(signal);

#ifdef UNIT_TEST_DATABUFFER2
  if (tphase == 120)
  {
    ndbout_c("basic test of ArenaPool / DataBuffer");

    for (Uint32 i = 0; i < 100; i++)
    {
      ArenaHead ah;
      if (!m_arenaAllocator.seize(ah))
      {
        ndbout_c("Failed to allocate arena");
        break;
      }

      ndbout_c("*** LOOP %u", i);
      Uint32 sum = 0;
      Dependency_map::Head head;
      LocalArenaPool<DataBufferSegment<14> > pool(ah, m_dependency_map_pool);
      for (Uint32 j = 0; j < 100; j++)
      {
        Uint32 sz = rand() % 1000;
        if (0)
          ndbout_c("adding %u", sz);
        Local_dependency_map list(pool, head);
        for (Uint32 i = 0; i < sz; i++)
          signal->theData[i] = sum + i;
        list.append(signal->theData, sz);
        sum += sz;
      }

      {
        ndbrequire(head.getSize() == sum);
        Local_dependency_map list(pool, head);
        Dependency_map::ConstDataBufferIterator it;
        Uint32 cnt = 0;
        for (list.first(it); !it.isNull(); list.next(it))
        {
          ndbrequire(*it.data == cnt);
          cnt++;
        }

        ndbrequire(cnt == sum);
      }

      Resource_limit rl;
      if (m_ctx.m_mm.get_resource_limit(7, rl))
      {
        ndbout_c("Resource %d min: %d max: %d curr: %d",
                 7, rl.m_min, rl.m_max, rl.m_curr);
      }

      {
        ndbout_c("release map");
        Local_dependency_map list(pool, head);
        list.release();
      }

      ndbout_c("release all");
      m_arenaAllocator.release(ah);
      ndbout_c("*** LOOP %u sum: %u", i, sum);
    }
  }
#endif
}//Dbspj::execSTTOR()

void
Dbspj::sendSTTORRY(Signal* signal)
{
  signal->theData[0] = 0;
  signal->theData[1] = 0;    /* BLOCK CATEGORY */
  signal->theData[2] = 0;    /* SIGNAL VERSION NUMBER */
  signal->theData[3] = 4;
#ifdef UNIT_TEST_DATABUFFER2
  signal->theData[4] = 120;  /* Start phase end */
#else
  signal->theData[4] = 255;
#endif
  signal->theData[5] = 255;
  sendSignal(f_STTOR_REF, GSN_STTORRY, signal, 6, JBB);
}

void
Dbspj::execREAD_NODESCONF(Signal* signal)
{
  jamEntry();

  ReadNodesConf * const conf = (ReadNodesConf *)signal->getDataPtr();
  {
    ndbrequire(signal->getNoOfSections() == 1);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz == 5 * NdbNodeBitmask::Size);
    copy((Uint32*)&conf->definedNodes.rep.data, ptr);
    releaseSections(handle);
  }

  if (getNodeState().getNodeRestartInProgress())
  {
    jam();
    c_alive_nodes = conf->startedNodes;
    c_alive_nodes.set(getOwnNodeId());
  }
  else
  {
    jam();
    c_alive_nodes = conf->startingNodes;
    NdbNodeBitmask tmp = conf->startedNodes;
    c_alive_nodes.bitOR(tmp);
  }

  for (Uint32 i = 0; i < MAX_NDB_NODES; i++)
  {
    m_location_domain_id[i] = 0;
  }

  ndb_mgm_configuration *p =
    m_ctx.m_config.getClusterConfig();
  ndb_mgm_configuration_iterator *p_iter =
    ndb_mgm_create_configuration_iterator(p, CFG_SECTION_NODE);

  for (ndb_mgm_first(p_iter);
       ndb_mgm_valid(p_iter);
       ndb_mgm_next(p_iter))
  {
    jam();
    Uint32 location_domain_id = 0;
    Uint32 nodeId = 0;
    Uint32 nodeType = 0;
    ndbrequire(!ndb_mgm_get_int_parameter(p_iter, CFG_NODE_ID, &nodeId) &&
               nodeId != 0);
    jamLine(Uint16(nodeId));
    ndbrequire(!ndb_mgm_get_int_parameter(p_iter,
                                          CFG_TYPE_OF_SECTION,
                                          &nodeType));
    ndbrequire(nodeId != 0);
    if (nodeType != NODE_TYPE_DB)
    {
      jam();
      continue;
    }
    ndbrequire(nodeId < MAX_NDB_NODES);
    ndb_mgm_get_int_parameter(p_iter,
                              CFG_LOCATION_DOMAIN_ID,
                              &location_domain_id);
    m_location_domain_id[nodeId] = location_domain_id;
  }
  ndb_mgm_destroy_iterator(p_iter);
  sendSTTORRY(signal);
}

void
Dbspj::execINCL_NODEREQ(Signal* signal)
{
  jamEntry();
  const Uint32 senderRef = signal->theData[0];
  const Uint32 nodeId  = signal->theData[1];

  ndbrequire(!c_alive_nodes.get(nodeId));
  c_alive_nodes.set(nodeId);

  signal->theData[0] = nodeId;
  signal->theData[1] = reference();
  sendSignal(senderRef, GSN_INCL_NODECONF, signal, 2, JBB);
}

void
Dbspj::execNODE_FAILREP(Signal* signal)
{
  jamEntry();

  NodeFailRep * rep = (NodeFailRep*)signal->getDataPtr();
  if(signal->getLength() == NodeFailRep::SignalLength)
  {
    ndbrequire(signal->getNoOfSections() == 1);
    ndbrequire(getNodeInfo(refToNode(signal->getSendersBlockRef())).m_version);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    memset(rep->theNodes, 0, sizeof(rep->theNodes));
    copy(rep->theNodes, ptr);
    releaseSections(handle);
  }
  else
  {
    memset(rep->theNodes + NdbNodeBitmask48::Size,
           0,
           _NDB_NBM_DIFF_BYTES);
  }
  NdbNodeBitmask failed;
  failed.assign(NdbNodeBitmask::Size, rep->theNodes);

  c_alive_nodes.bitANDC(failed);

  /* Clean up possibly fragmented signals being received or sent */
  for (Uint32 node = 1; node < MAX_NDB_NODES; node++)
  {
    if (failed.get(node))
    {
      jam();
      simBlockNodeFailure(signal, node);
    }//if
  }//for

  signal->theData[0] = 1;
  signal->theData[1] = 0;
  failed.copyto(NdbNodeBitmask::Size, signal->theData + 2);
  LinearSectionPtr lsptr[3];
  lsptr[0].p = signal->theData + 2;
  lsptr[0].sz = failed.getPackedLengthInWords();
  sendSignal(reference(), GSN_CONTINUEB, signal, 2,
             JBB, lsptr, 1);
}

void
Dbspj::execAPI_FAILREQ(Signal* signal)
{
  jamEntry();
  Uint32 failedApiNode = signal->theData[0];
  Uint32 ref = signal->theData[1];

  /**
   * We only need to care about lookups,
   *   as SCANs are aborted by DBTC.
   *
   * As SPJ does not receive / send fragmented signals
   *   directly to API nodes, simBlockNodeFailure()
   *   should not really be required - assert this.
   */
  Uint32 elementsCleaned = simBlockNodeFailure(signal, failedApiNode);
  ndbassert(elementsCleaned == 0); // As SPJ has no fragmented API signals
  (void) elementsCleaned;          // Avoid compiler warning for unused variable

  signal->theData[0] = failedApiNode;
  signal->theData[1] = reference();
  sendSignal(ref, GSN_API_FAILCONF, signal, 2, JBB);
}

void
Dbspj::execCONTINUEB(Signal* signal)
{
  jamEntry();
  switch(signal->theData[0]) {
  case 0:  // periodic releaseGlobal() timer, set up in STTOR phase 1
    releaseGlobal(signal);
    return;
  case 1:  // node-failure check of the lookup-request hash
    nodeFail_checkRequests(signal);
    return;
  case 2:  // node-failure check of the scan-request hash
    nodeFail_checkRequests(signal);
    return;
  case 3:  // resume a deferred SCAN_FRAGREQ prepare (DIH fragment lookup)
  {
    Ptr<TreeNode> treeNodePtr;
    Ptr<Request> requestPtr;
    m_treenode_pool.getPtr(treeNodePtr, signal->theData[1]);
    m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
    scanFrag_sendDihGetNodesReq(signal, requestPtr, treeNodePtr);
    checkPrepareComplete(signal, requestPtr);
    return;
  }
  }

  ndbabort();
}

void
Dbspj::nodeFail_checkRequests(Signal* signal)
{
  jam();
  const Uint32 type = signal->theData[0];
  const Uint32 bucket = signal->theData[1];

  NdbNodeBitmask failed;
  ndbrequire(signal->getNoOfSections() == 1);

  SegmentedSectionPtr ptr;
  SectionHandle handle(this,signal);
  handle.getSection(ptr, 0);
  ndbrequire(ptr.sz <= NdbNodeBitmask::Size);
  copy(failed.rep.data, ptr);
  releaseSections(handle);

  Request_iterator iter;
  Request_hash * hash = NULL;
  switch(type){
  case 1:
    hash = &m_lookup_request_hash;
    break;
  case 2:
    hash = &m_scan_request_hash;
    break;
  default:
    hash = NULL; //Silence compiler warning
    ndbabort(); //Impossible, avoid warning
  }
  hash->next(bucket, iter);

  const Uint32 RT_BREAK = 64;
  for (Uint32 i = 0; (i < RT_BREAK || iter.bucket == bucket) &&
         !iter.curr.isNull(); i++)
  {
    jam();

    Ptr<Request> requestPtr = iter.curr;
    hash->next(iter);
    i += nodeFail(signal, requestPtr, failed);
  }

  if (!iter.curr.isNull())
  {
    jam();
    signal->theData[0] = type;
    signal->theData[1] = bucket;
    failed.copyto(NdbNodeBitmask::Size, signal->theData + 2);
    LinearSectionPtr lsptr[3];
    lsptr[0].p = signal->theData + 2;
    lsptr[0].sz = failed.getPackedLengthInWords();
    sendSignal(reference(), GSN_CONTINUEB, signal, 2,
               JBB, lsptr, 1);
  }
  else if (type == 1)
  {
    jam();
    signal->theData[0] = 2;
    signal->theData[1] = 0;
    failed.copyto(NdbNodeBitmask::Size, signal->theData + 2);
    LinearSectionPtr lsptr[3];
    lsptr[0].p = signal->theData + 2;
    lsptr[0].sz = failed.getPackedLengthInWords();
    sendSignal(reference(), GSN_CONTINUEB, signal, 2,
               JBB, lsptr, 1);
  }
  else if (type == 2)
  {
    jam();
  }
}
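
/**
 * Summary of the CONTINUEB resume protocol used above (a descriptive
 * note derived from the code, not new behaviour):
 *   theData[0] = type   (1: lookup-request hash, 2: scan-request hash)
 *   theData[1] = bucket to resume the hash iteration from
 *   section 0  = bitmask of failed nodes
 * When the lookup hash (type 1) has been fully traversed, a type 2
 * CONTINUEB is sent so the scan hash is checked as well.
 */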

/**
 * MODULE LQHKEYREQ
 */
void Dbspj::execLQHKEYREQ(Signal* signal)
{
  jamEntry();
  if (unlikely(!assembleFragments(signal)))
  {
    jam();
    return;
  }

  c_Counters.incr_counter(CI_READS_RECEIVED, 1);

  if (ERROR_INSERTED(17014))
  {
    ndbrequire(refToNode(signal->getSendersBlockRef()) == getOwnNodeId());
  }

  const LqhKeyReq* req = reinterpret_cast<const LqhKeyReq*>(signal->getDataPtr());

  /**
   * #0 - KEYINFO contains key for first operation (used for hash in TC)
   * #1 - ATTRINFO contains tree + parameters
   *      (unless StoredProcId is set, when only parameters are sent,
   *       but this is not yet implemented)
   */
  SegmentedSectionPtr attrPtr;
  SectionHandle handle(this, signal);
  handle.getSection(attrPtr, LqhKeyReq::AttrInfoSectionNum);
  const Uint32 keyPtrI = handle.m_ptr[LqhKeyReq::KeyInfoSectionNum].i;

  Uint32 err;
  Ptr<Request> requestPtr(0, RNIL);
  do
  {
    ArenaHead ah;
    err = DbspjErr::OutOfQueryMemory;
    if (unlikely(!m_arenaAllocator.seize(ah)))
      break;

    if (ERROR_INSERTED_CLEAR(17001))
    {
      jam();
      ndbout_c("Injecting OutOfQueryMem error 17001 at line %d file %s",
               __LINE__, __FILE__);
      break;
    }
    if (unlikely(!m_request_pool.seize(ah, requestPtr)))
    {
      jam();
      break;
    }
    new (requestPtr.p) Request(ah);
    do_init(requestPtr.p, req, signal->getSendersBlockRef());

    Uint32 len_cnt;

    {
      SectionReader r0(attrPtr, getSectionSegmentPool());

      err = DbspjErr::ZeroLengthQueryTree;
      if (unlikely(!r0.getWord(&len_cnt)))
        break;
    }

    Uint32 len = QueryTree::getLength(len_cnt);
    Uint32 cnt = QueryTree::getNodeCnt(len_cnt);

    {
      SectionReader treeReader(attrPtr, getSectionSegmentPool());
      SectionReader paramReader(attrPtr, getSectionSegmentPool());
      paramReader.step(len); // skip over tree to parameters

      Build_context ctx;
      ctx.m_resultRef = req->variableData[0];
      ctx.m_savepointId = req->savePointId;
      ctx.m_scanPrio = 1;
      ctx.m_start_signal = signal;
      ctx.m_senderRef = signal->getSendersBlockRef();

      err = build(ctx, requestPtr, treeReader, paramReader);
      if (unlikely(err != 0))
        break;

      /**
       * The root TreeNode in the Request takes ownership of the keyPtr
       * section when build has completed.
       * We are done with attrPtr, which is now released.
       */
      Ptr<TreeNode> rootNodePtr = ctx.m_node_list[0];
      rootNodePtr.p->m_send.m_keyInfoPtrI = keyPtrI;
      release(attrPtr);
      handle.clear();
    }

    /**
     * Store request in list(s)/hash(es)
     */
    store_lookup(requestPtr);

    /**
     * A query being shipped as a LQHKEYREQ may return at most one row
     * per operation, i.e. it must be a (multi-)lookup.
     */
    if (ERROR_INSERTED_CLEAR(17013) ||
        unlikely(!requestPtr.p->isLookup() || requestPtr.p->m_node_cnt != cnt))
    {
      jam();
      err = DbspjErr::InvalidRequest;
      break;
    }

    prepare(signal, requestPtr);
    checkPrepareComplete(signal, requestPtr);
    return;
  } while (0);

  /**
   * Error handling below,
   *  'err' should contain the error code.
   */
  ndbassert(err != 0);
  if (!requestPtr.isNull())
  {
    jam();
    cleanup(requestPtr);
  }
  releaseSections(handle);  // a NOOP, if we reached 'handle.clear()' above
  handle_early_lqhkey_ref(signal, req, err);
}

void
Dbspj::do_init(Request* requestP, const LqhKeyReq* req, Uint32 senderRef)
{
  requestP->m_bits = 0;
  requestP->m_errCode = 0;
  requestP->m_state = Request::RS_BUILDING;
  requestP->m_node_cnt = 0;
  requestP->m_cnt_active = 0;
  requestP->m_rows = 0;
  requestP->m_active_tree_nodes.clear();
  requestP->m_completed_tree_nodes.set();
  requestP->m_outstanding = 0;
  requestP->m_transId[0] = req->transId1;
  requestP->m_transId[1] = req->transId2;
  requestP->m_rootFragId = LqhKeyReq::getFragmentId(req->fragmentData);
  requestP->m_rootFragCnt = 1;
  bzero(requestP->m_lookup_node_data, sizeof(requestP->m_lookup_node_data));
#ifdef SPJ_TRACE_TIME
  requestP->m_cnt_batches = 0;
  requestP->m_sum_rows = 0;
  requestP->m_sum_running = 0;
  requestP->m_sum_waiting = 0;
  requestP->m_save_time = NdbTick_getCurrentTicks();
#endif
  const Uint32 reqInfo = req->requestInfo;
  Uint32 tmp = req->clientConnectPtr;
  if (LqhKeyReq::getDirtyFlag(reqInfo) &&
      LqhKeyReq::getOperation(reqInfo) == ZREAD)
  {
    jam();

    ndbrequire(LqhKeyReq::getApplicationAddressFlag(reqInfo));
    //const Uint32 apiRef   = lqhKeyReq->variableData[0];
    //const Uint32 apiOpRec = lqhKeyReq->variableData[1];
    tmp = req->variableData[1];
    requestP->m_senderData = tmp;
    requestP->m_senderRef = senderRef;
  }
  else
  {
    if (LqhKeyReq::getSameClientAndTcFlag(reqInfo) == 1)
    {
      if (LqhKeyReq::getApplicationAddressFlag(reqInfo))
        tmp = req->variableData[2];
      else
        tmp = req->variableData[0];
    }
    requestP->m_senderData = tmp;
    requestP->m_senderRef = senderRef;
  }
  requestP->m_rootResultData = tmp;
}

void
Dbspj::store_lookup(Ptr<Request> requestPtr)
{
  ndbassert(requestPtr.p->isLookup());
  Ptr<Request> tmp;
  bool found = m_lookup_request_hash.find(tmp, *requestPtr.p);
  ndbrequire(found == false);
  m_lookup_request_hash.add(requestPtr);
}

void
Dbspj::handle_early_lqhkey_ref(Signal* signal,
                               const LqhKeyReq * lqhKeyReq,
                               Uint32 err)
{
  /**
   * Error path...
   */
  ndbrequire(err);
  const Uint32 reqInfo = lqhKeyReq->requestInfo;
  const Uint32 transid[2] = { lqhKeyReq->transId1, lqhKeyReq->transId2 };

  if (LqhKeyReq::getDirtyFlag(reqInfo) &&
      LqhKeyReq::getOperation(reqInfo) == ZREAD)
  {
    jam();
    /* Dirty read sends TCKEYREF direct to client, and nothing to TC */
    ndbrequire(LqhKeyReq::getApplicationAddressFlag(reqInfo));
    const Uint32 apiRef   = lqhKeyReq->variableData[0];
    const Uint32 apiOpRec = lqhKeyReq->variableData[1];

    TcKeyRef* const tcKeyRef = reinterpret_cast<TcKeyRef*>(signal->getDataPtrSend());

    tcKeyRef->connectPtr = apiOpRec;
    tcKeyRef->transId[0] = transid[0];
    tcKeyRef->transId[1] = transid[1];
    tcKeyRef->errorCode = err;
    sendTCKEYREF(signal, apiRef, signal->getSendersBlockRef());
  }
  else
  {
    jam();
    const Uint32 returnref = signal->getSendersBlockRef();
    const Uint32 clientPtr = lqhKeyReq->clientConnectPtr;

    Uint32 TcOprec = clientPtr;
    if (LqhKeyReq::getSameClientAndTcFlag(reqInfo) == 1)
    {
      if (LqhKeyReq::getApplicationAddressFlag(reqInfo))
        TcOprec = lqhKeyReq->variableData[2];
      else
        TcOprec = lqhKeyReq->variableData[0];
    }

    LqhKeyRef* const ref = reinterpret_cast<LqhKeyRef*>(signal->getDataPtrSend());
    ref->userRef = clientPtr;
    ref->connectPtr = TcOprec;
    ref->errorCode = err;
    ref->transId1 = transid[0];
    ref->transId2 = transid[1];
    sendSignal(returnref, GSN_LQHKEYREF, signal,
               LqhKeyRef::SignalLength, JBB);
  }
}

void
Dbspj::sendTCKEYREF(Signal* signal, Uint32 ref, Uint32 routeRef)
{
  const Uint32 nodeId = refToNode(ref);
  const bool connectedToNode = getNodeInfo(nodeId).m_connected;

  if (likely(connectedToNode))
  {
    jam();
    sendSignal(ref, GSN_TCKEYREF, signal, TcKeyRef::SignalLength, JBB);
  }
  else
  {
    jam();
    memmove(signal->theData+25, signal->theData, 4*TcKeyRef::SignalLength);
    RouteOrd* ord = (RouteOrd*)signal->getDataPtrSend();
    ord->dstRef = ref;
    ord->srcRef = reference();
    ord->gsn = GSN_TCKEYREF;
    ord->cnt = 0;
    LinearSectionPtr ptr[3];
    ptr[0].p = signal->theData+25;
    ptr[0].sz = TcKeyRef::SignalLength;
    sendSignal(routeRef, GSN_ROUTE_ORD, signal, RouteOrd::SignalLength, JBB,
               ptr, 1);
  }
}

void
Dbspj::sendTCKEYCONF(Signal* signal, Uint32 len, Uint32 ref, Uint32 routeRef)
{
  const Uint32 nodeId = refToNode(ref);
  const bool connectedToNode = getNodeInfo(nodeId).m_connected;

  if (likely(connectedToNode))
  {
    jam();
    sendSignal(ref, GSN_TCKEYCONF, signal, len, JBB);
  }
  else
  {
    jam();
    memmove(signal->theData+25, signal->theData, 4*len);
    RouteOrd* ord = (RouteOrd*)signal->getDataPtrSend();
    ord->dstRef = ref;
    ord->srcRef = reference();
    ord->gsn = GSN_TCKEYCONF;
    ord->cnt = 0;
    LinearSectionPtr ptr[3];
    ptr[0].p = signal->theData+25;
    ptr[0].sz = len;
    sendSignal(routeRef, GSN_ROUTE_ORD, signal, RouteOrd::SignalLength, JBB,
               ptr, 1);
  }
}

/**
 * END - MODULE LQHKEYREQ
 */


/**
 * MODULE SCAN_FRAGREQ
 */
void
Dbspj::execSCAN_FRAGREQ(Signal* signal)
{
  jamEntry();

  /* Reassemble if the request was fragmented */
  if (!assembleFragments(signal))
  {
    jam();
    return;
  }

  if (ERROR_INSERTED(17014))
  {
    ndbrequire(refToNode(signal->getSendersBlockRef()) == getOwnNodeId());
  }

  const ScanFragReq * req = (ScanFragReq *)&signal->theData[0];

#ifdef DEBUG_SCAN_FRAGREQ
  ndbout_c("Incoming SCAN_FRAGREQ ");
  printSCAN_FRAGREQ(stdout, signal->getDataPtrSend(),
                    ScanFragReq::SignalLength + 2,
                    DBLQH);
#endif

  /**
   * #0 - ATTRINFO contains tree + parameters
   *      (unless StoredProcId is set, when only parameters are sent,
   *       but this is not yet implemented)
   * #1 - KEYINFO if first op is index scan - contains bounds for first scan
   *              if first op is lookup - contains keyinfo for lookup
   */
  SectionHandle handle(this, signal);
  SegmentedSectionPtr attrPtr;
  handle.getSection(attrPtr, ScanFragReq::AttrInfoSectionNum);

  Uint32 err;
  Ptr<Request> requestPtr(0, RNIL);
  do
  {
    ArenaHead ah;
    err = DbspjErr::OutOfQueryMemory;
    if (unlikely(!m_arenaAllocator.seize(ah)))
      break;

    if (ERROR_INSERTED_CLEAR(17002))
    {
      ndbout_c("Injecting OutOfQueryMem error 17002 at line %d file %s",
               __LINE__, __FILE__);
      jam();
      break;
    }
    if (unlikely(!m_request_pool.seize(ah, requestPtr)))
    {
      jam();
      break;
    }
    new (requestPtr.p) Request(ah);
    do_init(requestPtr.p, req, signal->getSendersBlockRef());

    Uint32 len_cnt;
    {
      SectionReader r0(attrPtr, getSectionSegmentPool());
      err = DbspjErr::ZeroLengthQueryTree;
      if (unlikely(!r0.getWord(&len_cnt)))
        break;
    }

    Uint32 len = QueryTree::getLength(len_cnt);
    Uint32 cnt = QueryTree::getNodeCnt(len_cnt);

    Uint32 sectionCnt = handle.m_cnt;
    Uint32 fragIdsPtrI = RNIL;
    if (ScanFragReq::getMultiFragFlag(req->requestInfo))
    {
      jam();
      sectionCnt--;
      fragIdsPtrI = handle.m_ptr[sectionCnt].i;
      SectionReader fragsReader(fragIdsPtrI, getSectionSegmentPool());

      //Unpack into extended signal memory:
      const Uint32 fragCnt = signal->theData[25] = fragsReader.getSize();
      if (unlikely(!fragsReader.getWords(&signal->theData[26], fragCnt)))
      {
        jam();
        err = DbspjErr::InvalidRequest;
        break;
      }
    }

    {
      SectionReader treeReader(attrPtr, getSectionSegmentPool());
      SectionReader paramReader(attrPtr, getSectionSegmentPool());
      paramReader.step(len); // skip over tree to parameters

      Build_context ctx;
      ctx.m_resultRef = req->resultRef;
      ctx.m_scanPrio = ScanFragReq::getScanPrio(req->requestInfo);
      ctx.m_savepointId = req->savePointId;
      ctx.m_batch_size_rows = req->batch_size_rows;
      ctx.m_start_signal = signal;
      ctx.m_senderRef = signal->getSendersBlockRef();

      err = build(ctx, requestPtr, treeReader, paramReader);
      if (unlikely(err != 0))
        break;

      /**
       * The root TreeNode in the Request takes ownership of the keyPtr
       * section when build has completed.
       * We are done with attrPtr and the MultiFrag-list, which are
       * now released.
       */
      Ptr<TreeNode> rootNodePtr = ctx.m_node_list[0];
      if (sectionCnt > ScanFragReq::KeyInfoSectionNum)
      {
        jam();
        sectionCnt--;
        const Uint32 keyPtrI = handle.m_ptr[ScanFragReq::KeyInfoSectionNum].i;
        rootNodePtr.p->m_send.m_keyInfoPtrI = keyPtrI;
      }
      release(attrPtr);
      releaseSection(fragIdsPtrI); //MultiFrag list
      handle.clear();
    }

    /**
     * Store request in list(s)/hash(es)
     */
    store_scan(requestPtr);

    if (ERROR_INSERTED_CLEAR(17013) ||
        unlikely(!requestPtr.p->isScan() || requestPtr.p->m_node_cnt != cnt))
    {
      jam();
      err = DbspjErr::InvalidRequest;
      break;
    }

    prepare(signal, requestPtr);
    checkPrepareComplete(signal, requestPtr);
    return;
  } while (0);

  /**
   * Error handling below,
   *  'err' should contain the error code.
   */
  ndbassert(err != 0);
  if (!requestPtr.isNull())
  {
    jam();
    cleanup(requestPtr);
  }
  releaseSections(handle);  // a NOOP, if we reached 'handle.clear()' above
  handle_early_scanfrag_ref(signal, req, err);
}

void
Dbspj::do_init(Request* requestP, const ScanFragReq* req, Uint32 senderRef)
{
  requestP->m_bits = Request::RT_SCAN;
  requestP->m_errCode = 0;
  requestP->m_state = Request::RS_BUILDING;
  requestP->m_node_cnt = 0;
  requestP->m_cnt_active = 0;
  requestP->m_rows = 0;
  requestP->m_active_tree_nodes.clear();
  requestP->m_completed_tree_nodes.set();
  requestP->m_outstanding = 0;
  requestP->m_senderRef = senderRef;
  requestP->m_senderData = req->senderData;
  requestP->m_transId[0] = req->transId1;
  requestP->m_transId[1] = req->transId2;
  requestP->m_rootResultData = req->resultData;
  requestP->m_rootFragId = req->fragmentNoKeyLen;
  requestP->m_rootFragCnt = 0; //Filled in later
  bzero(requestP->m_lookup_node_data, sizeof(requestP->m_lookup_node_data));
#ifdef SPJ_TRACE_TIME
  requestP->m_cnt_batches = 0;
  requestP->m_sum_rows = 0;
  requestP->m_sum_running = 0;
  requestP->m_sum_waiting = 0;
  requestP->m_save_time = NdbTick_getCurrentTicks();
#endif
}

void
Dbspj::store_scan(Ptr<Request> requestPtr)
{
  ndbassert(requestPtr.p->isScan());
  Ptr<Request> tmp;
  bool found = m_scan_request_hash.find(tmp, *requestPtr.p);
  ndbrequire(found == false);
  m_scan_request_hash.add(requestPtr);
}

void
Dbspj::handle_early_scanfrag_ref(Signal* signal,
                                 const ScanFragReq * _req,
                                 Uint32 err)
{
  ScanFragReq req = *_req;
  Uint32 senderRef = signal->getSendersBlockRef();

  ScanFragRef * ref = (ScanFragRef*)&signal->theData[0];
  ref->senderData = req.senderData;
  ref->transId1 = req.transId1;
  ref->transId2 = req.transId2;
  ref->errorCode = err;
  sendSignal(senderRef, GSN_SCAN_FRAGREF, signal,
             ScanFragRef::SignalLength, JBB);
}

/**
 * END - MODULE SCAN_FRAGREQ
 */

/**
 * MODULE GENERIC
 */
Uint32
Dbspj::build(Build_context& ctx,
             Ptr<Request> requestPtr,
             SectionReader & tree,
             SectionReader & param)
{
  Uint32 tmp0, tmp1;
  Uint32 err = DbspjErr::ZeroLengthQueryTree;
  ctx.m_cnt = 0;
  ctx.m_scan_cnt = 0;

  tree.getWord(&tmp0);
  Uint32 loop = QueryTree::getNodeCnt(tmp0);

  DEBUG("::build()");
  err = DbspjErr::InvalidTreeNodeCount;
  if (loop == 0 || loop > NDB_SPJ_MAX_TREE_NODES)
  {
    jam();
    goto error;
  }

  while (ctx.m_cnt < loop)
  {
    DEBUG(" - loop " << ctx.m_cnt << " pos: " << tree.getPos().currPos);
    tree.peekWord(&tmp0);
    param.peekWord(&tmp1);
    Uint32 node_op = QueryNode::getOpType(tmp0);
    Uint32 node_len = QueryNode::getLength(tmp0);
    Uint32 param_op = QueryNodeParameters::getOpType(tmp1);
    Uint32 param_len = QueryNodeParameters::getLength(tmp1);

    err = DbspjErr::QueryNodeTooBig;
    if (unlikely(node_len >= NDB_ARRAY_SIZE(m_buffer0)))
    {
      jam();
      goto error;
    }

    err = DbspjErr::QueryNodeParametersTooBig;
    if (unlikely(param_len >= NDB_ARRAY_SIZE(m_buffer1)))
    {
      jam();
      goto error;
    }

    err = DbspjErr::InvalidTreeNodeSpecification;
    if (unlikely(tree.getWords(m_buffer0, node_len) == false))
    {
      jam();
      goto error;
    }

    err = DbspjErr::InvalidTreeParametersSpecification;
    if (unlikely(param.getWords(m_buffer1, param_len) == false))
    {
      jam();
      goto error;
    }

#if defined(DEBUG_LQHKEYREQ) || defined(DEBUG_SCAN_FRAGREQ)
    printf("node: ");
    for (Uint32 i = 0; i < node_len; i++)
      printf("0x%.8x ", m_buffer0[i]);
    printf("\n");

    printf("param: ");
    for (Uint32 i = 0; i < param_len; i++)
      printf("0x%.8x ", m_buffer1[i]);
    printf("\n");
#endif

    err = DbspjErr::UnknowQueryOperation;
    if (unlikely(node_op != param_op))
    {
      jam();
      goto error;
    }
    if (ERROR_INSERTED_CLEAR(17006))
    {
      ndbout_c("Injecting UnknowQueryOperation error 17006 at line %d file %s",
               __LINE__, __FILE__);
      jam();
      goto error;
    }

    const OpInfo* info = NULL;
    if (unlikely(node_op == QueryNode::QN_SCAN_FRAG_v1))
    {
      /**
       * Convert the deprecated SCAN_FRAG_v1 node+param to the new SCAN_FRAG:
       *  - The 'node' formats are identical, no conversion needed.
       *  - The QN_ScanFragParameters has two additional 'batch_size' members.
       *    In addition there are three unused Uint32 members for future use
       *    (5 new words in total).
       *    Extend the entire param block to make room for them, and fill
       *    them in from 'req'.
       *
       *    {len, requestInfo, resultData}
       *     -> {len, requestInfo, resultData,
       *         batch_size_rows, batch_size_bytes, unused0-2}
       */
      jam();
      QN_ScanFragParameters_v1 *param_old = (QN_ScanFragParameters_v1*)m_buffer1;
      const Uint32 requestInfo = param_old->requestInfo;
      const Uint32 resultData = param_old->resultData;

      if (unlikely(param_len+5 >= NDB_ARRAY_SIZE(m_buffer1)))
      {
        jam();
        err = DbspjErr::QueryNodeParametersTooBig;
        goto error;
      }
      QN_ScanFragParameters *param = (QN_ScanFragParameters*)m_buffer1;
      /* Move the data beyond 'NodeSize' to after the space reserved for
       * the new parameters */
      memmove(((Uint32*)param)+param->NodeSize,
              ((Uint32*)param_old)+param_old->NodeSize,
              (param_len-param_old->NodeSize) * sizeof(Uint32));
      param_len+=5;

      param->requestInfo = requestInfo;
      param->resultData = resultData;

      /* Calculate and fill in param 'batchSize' from request */
      Signal* signal = ctx.m_start_signal;
      const ScanFragReq* req = (const ScanFragReq*)(signal->getDataPtr());
      param->batch_size_rows = req->batch_size_rows;
      param->batch_size_bytes = req->batch_size_bytes;
      param->unused0 = 0;
      param->unused1 = 0;
      param->unused2 = 0;

      /* Execute root scan with full parallelism - as SCAN_FRAG_v1 always did */
      param->requestInfo |= QN_ScanFragParameters::SFP_PARALLEL;

      info = &Dbspj::g_ScanFragOpInfo;
    }
    else if (unlikely(node_op == QueryNode::QN_SCAN_INDEX_v1))
    {
      /**
       * Convert the deprecated SCAN_INDEX_v1 node+param to the new SCAN_FRAG:
       *  - The 'node' formats are identical, no conversion needed.
       *  - The QN_ScanIndexParameters has split the single batchSize into
       *    two separate 'batch_size' members and introduced an additional
       *    three unused Uint32 members for future use (4 new words in total).
       *    Extend the entire param block to make room for them, and fill
       *    them in from the old batchSize argument.
       *
       *    {len, requestInfo, batchSize, resultData}
       *     -> {len, requestInfo, resultData,
       *         batch_size_rows, batch_size_bytes, unused0-2}
       */
      jam();
      QN_ScanIndexParameters_v1 *param_old = (QN_ScanIndexParameters_v1*)m_buffer1;
      const Uint32 requestInfo = param_old->requestInfo;
      const Uint32 batchSize = param_old->batchSize;
      const Uint32 resultData = param_old->resultData;

      if (unlikely(param_len+4 >= NDB_ARRAY_SIZE(m_buffer1)))
      {
        jam();
        err = DbspjErr::QueryNodeParametersTooBig;
        goto error;
      }
      QN_ScanFragParameters *param = (QN_ScanFragParameters*)m_buffer1;
      /* Move the data beyond 'NodeSize' to after the space reserved for
       * the new parameters */
      memmove(((Uint32*)param)+param->NodeSize,
              ((Uint32*)param_old)+param_old->NodeSize,
              (param_len-param_old->NodeSize) * sizeof(Uint32));
      param_len+=4;

      param->requestInfo = requestInfo;
      param->resultData = resultData;
      param->batch_size_rows = batchSize & ~(0xFFFFFFFF << QN_ScanIndexParameters_v1::BatchRowBits);
      param->batch_size_bytes = batchSize >> QN_ScanIndexParameters_v1::BatchRowBits;
      param->unused0 = 0;
      param->unused1 = 0;
      param->unused2 = 0;

      info = &Dbspj::g_ScanFragOpInfo;
    }
    else
    {
      info = getOpInfo(node_op);
      if (unlikely(info == NULL))
      {
        jam();
        goto error;
      }
    }

    QueryNode* qn = (QueryNode*)m_buffer0;
    QueryNodeParameters * qp = (QueryNodeParameters*)m_buffer1;
    qn->len = node_len;
    qp->len = param_len;
    err = (this->*(info->m_build))(ctx, requestPtr, qn, qp);
    if (unlikely(err != 0))
    {
      jam();
      goto error;
    }

    /**
     * Only the first node gets access to the start signal.
     */
    ctx.m_start_signal = NULL;

    ndbrequire(ctx.m_cnt < NDB_ARRAY_SIZE(ctx.m_node_list));
    ctx.m_cnt++;
  }
  requestPtr.p->m_node_cnt = ctx.m_cnt;

  if (ctx.m_scan_cnt > 1)
  {
    jam();
    requestPtr.p->m_bits |= Request::RT_MULTI_SCAN;
  }

  // Set up the order of execution in the plan
  buildExecPlan(requestPtr);

  // Construct RowBuffers where required
  err = initRowBuffers(requestPtr);
  if (unlikely(err != 0))
  {
    jam();
    goto error;
  }

  return 0;

error:
  jam();
  return err;
}
1655 
1656 /**
1657  * initRowBuffers will decide row-buffering strategy, and init
1658  * the RowBuffers where required.
1659  */
1660 Uint32
initRowBuffers(Ptr<Request> requestPtr)1661 Dbspj::initRowBuffers(Ptr<Request> requestPtr)
1662 {
1663   jam();
1664 
1665   /**
1666    * Init BUFFERS iff Request has to buffer any rows/matches
1667    */
1668   if (requestPtr.p->m_bits & Request::RT_BUFFERS)
1669   {
1670     jam();
1671 
1672     /**
1673      * Iff, multi-scan is non-bushy (normal case)
1674      *   we don't strictly need BUFFER_VAR for RT_BUFFERS
1675      *   but could instead pop-row stack frame,
1676      *     however this is not implemented...
1677      *
1678      * so, currently use BUFFER_VAR if 'RT_MULTI_SCAN'
1679      *
1680      * NOTE: This should easily be solvable by having a
1681      *       RowBuffer for each TreeNode instead
1682      */
1683     if (requestPtr.p->m_bits & Request::RT_MULTI_SCAN)
1684     {
1685       jam();
1686       requestPtr.p->m_rowBuffer.init(BUFFER_VAR);
1687     }
1688     else
1689     {
1690       jam();
1691       requestPtr.p->m_rowBuffer.init(BUFFER_STACK);
1692     }
1693 
1694     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
1695     Ptr<TreeNode> treeNodePtr;
1696     for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
1697     {
1698       jam();
1699       ndbassert(treeNodePtr.p->m_batch_size > 0);
1700       /**
1701        * Construct a List or Map RowCollection for those TreeNodes
1702        * requiring rows to be buffered.
1703        */
1704       if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MAP)
1705       {
1706         jam();
1707         treeNodePtr.p->m_rows.construct (RowCollection::COLLECTION_MAP,
1708                                          requestPtr.p->m_rowBuffer,
1709                                          treeNodePtr.p->m_batch_size);
1710       }
1711       else if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY)
1712       {
1713         jam();
1714         treeNodePtr.p->m_rows.construct (RowCollection::COLLECTION_LIST,
1715                                          requestPtr.p->m_rowBuffer,
1716                                          treeNodePtr.p->m_batch_size);
1717       }
1718     }
1719   }
1720 
1721   return 0;
1722 } // Dbspj::initRowBuffers
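/**
 * Illustration (not part of the block logic): the buffer-type choice
 * above reduces to a pure function of the Request bits. A minimal
 * sketch, assuming the hypothetical helper name 'chooseBufferType'
 * and an assumed enum type name for the buffer kinds:
 *
 *   // Multiple concurrent scans release buffered rows out of order,
 *   // which requires the variable allocator. A single scan can use
 *   // the cheaper stack allocator, which is popped as a whole.
 *   static RowBufferKind chooseBufferType(Uint32 requestBits)
 *   {
 *     return (requestBits & Request::RT_MULTI_SCAN)
 *       ? BUFFER_VAR : BUFFER_STACK;
 *   }
 */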
1723 
1724 
1725 /**
1726  * setupAncestors():
1727  *
1728  * Complete the query tree topology as given by the SPJ API:
1729  *
1730  * Fill in the m_ancestors bitMask, and set the reference to
1731  * our closest scanAncestor in each TreeNode. Also set
1732  * the 'm_coverage' of each TreeNode.
1733  */
1734 void
1735 Dbspj::setupAncestors(Ptr<Request>  requestPtr,
1736                       Ptr<TreeNode> treeNodePtr,
1737                       Uint32        scanAncestorPtrI)
1738 {
1739   LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
1740   Local_dependency_map const childList(pool, treeNodePtr.p->m_child_nodes);
1741   Dependency_map::ConstDataBufferIterator it;
1742 
1743   treeNodePtr.p->m_scanAncestorPtrI = scanAncestorPtrI;
1744   if (treeNodePtr.p->isScan())
1745   {
1746     scanAncestorPtrI = treeNodePtr.i;
1747   }
1748 
1749   for (childList.first(it); !it.isNull(); childList.next(it))
1750   {
1751     jam();
1752     Ptr<TreeNode> childPtr;
1753     m_treenode_pool.getPtr(childPtr, *it.data);
1754 
1755     childPtr.p->m_ancestors = treeNodePtr.p->m_ancestors;
1756     childPtr.p->m_ancestors.set(treeNodePtr.p->m_node_no);
1757 
1758     setupAncestors(requestPtr, childPtr, scanAncestorPtrI);
1759 
1760     treeNodePtr.p->m_coverage.bitOR(childPtr.p->m_coverage);
1761   }
1762   treeNodePtr.p->m_coverage.set(treeNodePtr.p->m_node_no);
1763 }
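/**
 * Worked example of ::setupAncestors() (illustration only): assume a
 * 4-node tree where t0 is a scan with children t1 (lookup) and
 * t2 (scan), and t1 has the lookup child t3. After the recursion:
 *
 *   t0: m_ancestors={},      m_coverage={t0,t1,t2,t3}, scanAncestor=RNIL
 *   t1: m_ancestors={t0},    m_coverage={t1,t3},       scanAncestor=t0
 *   t3: m_ancestors={t0,t1}, m_coverage={t3},          scanAncestor=t0
 *   t2: m_ancestors={t0},    m_coverage={t2},          scanAncestor=t0
 *
 * Note that the lookup t1 does not become a scanAncestor; the closest
 * scanAncestor of t3 is still the scan t0.
 */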
1764 
1765 
1766 /**
1767  * buildExecPlan()
1768  *
1769  *   Decides the order/pace in which the different TreeNodes should
1770  *   be executed. We basically choose between two strategies:
1771  *
1772  *   Lookup-queries return at most a single row from each
1773  *   TreeNode in the SPJ-request. We believe these to impose
1774  *   a relatively low CPU load on the system. We try to reduce
1775  *   the elapsed execution time for these requests by
1776  *   submitting as many of the LQHKEYREQ's as possible in parallel.
1777  *   Thereby also taking advantage of the datanode parallelism.
1778  *
1779  *   On the other hand, scan queries have the potential for returning
1780  *   huge result sets. Furthermore, the root scan operation will
1781  *   result in SPJ sub requests being sent to all datanodes. Thus
1782  *   the datanode parallelism is utilized without executing
1783  *   the SPJ request's TreeNodes in parallel. For such queries
1784  *   we will execute INNER-joined TreeNodes in sequence, wherever
1785  *   possible taking advantage of the fact that we can skip further
1786  *   operations on rows where preceding matches were not found.
1787  *
1788  *   Note that prior to introducing INNER-join handling in SPJ,
1789  *   all queries effectively were executed with the most parallel
1790  *   execution plan.
1791  */
1792 Uint32
1793 Dbspj::buildExecPlan(Ptr<Request> requestPtr)
1794 {
1795   Ptr<TreeNode> treeRootPtr;
1796   Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
1797   list.first(treeRootPtr);
1798 
1799   /**
1800    * Brute force solution to ensure that all rows in
1801    * batch are sorted if requested:
1802    *
1803    * In a scan-scan (MULTI_SCAN) request the result is effectively
1804    * generated as a cross product between the scans. If the child-scan
1805    * batches need another NEXTREQ to retrieve remaining rows, the parent
1806    * scan's result rows will effectively be repeated together with the new
1807    * rows from the child scans.
1808    * By restricting the parent scan to a batch size of one row, the
1809    * parent rows will still be sorted, even if multiple child batches
1810    * have to be fetched.
1811    */
1812   if (treeRootPtr.p->m_bits & TreeNode::T_SORTED_ORDER &&
1813       requestPtr.p->m_bits & Request::RT_MULTI_SCAN)
1814   {
1815     jam();
1816     ndbassert(treeRootPtr.p->m_bits & TreeNode::T_SCAN_PARALLEL);
1817     ScanFragData& data = treeRootPtr.p->m_scanFrag_data;
1818     ScanFragReq* const dst = reinterpret_cast<ScanFragReq*>(data.m_scanFragReq);
1819     dst->batch_size_rows = 1;
1820   }
1821 
1822   setupAncestors(requestPtr, treeRootPtr, RNIL);
1823 
1824   if (requestPtr.p->isScan())
1825   {
1826     const Uint32 err = planSequentialExec(requestPtr, treeRootPtr,
1827                                           NullTreeNodePtr, NullTreeNodePtr);
1828     if (unlikely(err))
1829       return err;
1830   }
1831   else
1832   {
1833     const Uint32 err = planParallelExec(requestPtr, treeRootPtr);
1834     if (unlikely(err))
1835       return err;
1836   }
1837 
1838 #ifdef VM_TRACE
1839   DEBUG("Execution plan, TreeNode execution order:");
1840   dumpExecPlan(requestPtr, treeRootPtr);
1841 #endif
1842 
1843   return 0;
1844 } // buildExecPlan()
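/**
 * Illustration of the strategy choice above (values hedged, not taken
 * from a real trace): a request rooted in a PK lookup gets the parallel
 * plan, so each child LQHKEYREQ is submitted as soon as its parent row
 * arrives, minimizing elapsed time. A request rooted in an (index-)scan
 * gets the sequential plan, so a scan row which fails an INNER-joined
 * lookup short-circuits all TreeNodes depending on that match.
 */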
1845 
1846 
1847 /**
1848  * planParallelExec():
1849  *
1850  *  Set up the most parallelized execution plan for the query.
1851  *  This happens to be the same query topology as represented by the
1852  *  child / parent references in the SPJ request from the API.
1853  *  So we could simply copy the child / ancestor dependencies as
1854  *  the final order of execution.
1855  *
1856  *  For such an execution plan we may execute all child-TreeNodes in
1857  *  parallel - even if there are non-matching child rows which will
1858  *  eventually result in both the parent row and all adjacent child rows
1859  *  being eliminated from the final inner-joined result set.
1860  *
1861  *  Such a join plan is most suited for a query processing relatively few
1862  *  rows, where the overhead of returning rows which are later eliminated
1863  *  is low. The possible advantage of this query plan is a lower elapsed time
1864  *  for the query execution, possibly at the cost of some higher CPU usage.
1865  */
1866 Uint32
1867 Dbspj::planParallelExec(Ptr<Request>  requestPtr,
1868                         Ptr<TreeNode> treeNodePtr)
1869 {
1870   LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
1871   Local_dependency_map child(pool, treeNodePtr.p->m_child_nodes);
1872   Local_dependency_map execList(pool, treeNodePtr.p->m_next_nodes);
1873   Dependency_map::ConstDataBufferIterator it;
1874 
1875   treeNodePtr.p->m_predecessors = treeNodePtr.p->m_ancestors;
1876   treeNodePtr.p->m_dependencies = treeNodePtr.p->m_ancestors;
1877 
1878   for (child.first(it); !it.isNull(); child.next(it))
1879   {
1880     Ptr<TreeNode> childPtr;
1881     m_treenode_pool.getPtr(childPtr, *it.data);
1882     if (unlikely(!execList.append(&childPtr.i, 1)))
1883     {
1884       jam();
1885       return DbspjErr::OutOfQueryMemory;
1886     }
1887 
1888     const Uint32 err = planParallelExec(requestPtr, childPtr);
1889     if (unlikely(err))
1890       return err;
1891 
1892     treeNodePtr.p->m_coverage.bitOR(childPtr.p->m_coverage);
1893   }
1894 
1895   return 0;
1896 } // Dbspj::planParallelExec
1897 
1898 
1899 /**
1900  * planSequentialExec()
1901  *
1902  *   Build an execution plan where INNER-joined TreeNodes are executed in
1903  *   sequence, such that further evaluation of not matching rows could be
1904  *   skipped as early as possible.
1905  *
1906  *  Steps:
1907  *
1908  * 1)
1909  *  Each 'branch' has the property that it starts with either a scan-TreeNode,
1910  *  or an outer joined (lookup-) TreeNode. Any INNER-joined lookup-nodes having
1911  *  this TreeNode as a (grand-)parent are also members of the branch.
1912  *
1913  *  Such a 'branch' of INNER-joined lookups has the property that an EQ-match
1914  *  has to be found from all its TreeNodes in order for any of the related
1915  *  rows to be part of the joined result set. Thus, during execution we can
1916  *  skip any further child lookups as soon as a non-match is found. This is
1917  *  represented in the execution plan by appending the INNER-joined lookups
1918  *  in a sequence.
1919  *
1920  *  Note that we are 'greedy' in appending these INNER-joined lookups,
1921  *  such that a lookup-TreeNode may effectively be executed prior to a
1922  *  scan-TreeNode, even if the scan is located before the lookup in the
1923  *  'm_nodes' list produced by the SPJ-API. This is intentional: an
1924  *  INNER-joined lookup row failing to find its match would eliminate the
1925  *  need for executing the much more expensive (index-)scan operation.
1926  *
1927  * 2)
1928  *  Recursively append a *single* INNER-joined scan-*branch* after the
1929  *  end of the branch from 1). As it is called recursively, the scan
1930  *  branch will append further lookup-nodes which depend on this scan-node,
1931  *  and finally append any remaining INNER-joined scan branches.
1932  *
1933  *  Note1: Due to old legacy in the SPJ-API protocol, all scan nodes
1934  *  have to be executed in order relative to each other. (This explains
1935  *  the 'single' mentioned above.)
1936  *
1937  *  Note2: After the two steps above have completed, including the recursive call
1938  *  handling the INNER-joined scan, all INNER-joined TreeNodes to be joined with
1939  *  this 'branch' have been added to the exec plan.
1940  *
1941  *  Note3: Below we use the term 'non-INNER-joined', instead of 'OUTER-joined'.
1942  *  This is due to SPJ-API protocol compatibility, where we previously didn't
1943  *  tag the TreeNodes as being INNER-joined or not. Thus when receiving a SPJ
1944  *  request from an API client, we can't tell for sure whether the TreeNode
1945  *  is outer joined, or if the (old) client simply didn't specify INNER-joins.
1946  *  Thus all we know is that nodes are 'non-INNER-joined'.
1947  *
1948  *  Also note that for any request from such an old API client, no
1949  *  'sequential' TreeNodes will be appended to the exec plan in 1) and 2)
1950  *  above. Only steps 3) and 4) below will effectively be used, which will
1951  *  (intentionally) result in a parallelized query plan, identical to what
1952  *  it used to be prior to introducing these INNER-join optimizations.
1953  *
1954  * 3)
1955  *  Recursively append all non-INNER-joined lookup branches to be executed
1956  *  after the sequence of INNER-joined-lookups (from 1). Note that these
1957  *  branches are executed in sequence in a left -> right order, such
1958  *  that when the 'left' branch is completed, we 'RESUME' into the 'right'
1959  *  branch. This is done in order to avoid overflowing the job buffers
1960  *  due to too many LQHKEYREQ-signals being sent at once.
1961  *  The 'nextBranchPtr' is set up by this step as the 'right' lookup branch
1962  *  to RESUME. (See appendTreeNode() for more about RESUME handling)
1963  *
1964  * 4)
1965  *  Recursively append all non-INNER-joined scan branches to be executed
1966  *  in *parallel* after the sequence of INNER-joined-lookups (from 1).
1967  *  As we do not really handle OUTER-joined scans (yet), this is only
1968  *  in effect when we get a SPJ request from an old API, which does not
1969  *  specify INNER-join for a scan-TreeNode. Thus the old-style 'submit
1970  *  scan in parallel'-plan will be produced.
1971  *  For a client using the updated SPJ-API, all scans will be handled in 2)
1972  *
1973  */
1974 Uint32
1975 Dbspj::planSequentialExec(Ptr<Request>  requestPtr,
1976                           const Ptr<TreeNode> branchPtr,
1977                           Ptr<TreeNode> prevExecPtr,
1978                           const Ptr<TreeNode> nextBranchPtr)
1979 {
1980   DEBUG("planSequentialExec, start branch at treeNode no: " << branchPtr.p->m_node_no);
1981 
1982   // Append head of branch to be executed after 'prevExecPtr'
1983   const Uint32 err = appendTreeNode(requestPtr, branchPtr, prevExecPtr, nextBranchPtr);
1984   if (unlikely(err))
1985     return err;
1986 
1987   Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
1988   TreeNodeBitMask predecessors(branchPtr.p->m_predecessors);
1989   predecessors.set(branchPtr.p->m_node_no);
1990 
1991   /**
1992    * 1) Append all INNER-joined lookups to the 'plan' to be executed in sequence.
1993    * Maintain the set of 'predecessor' TreeNodes which are already executed.
1994    * Don't append TreeNodes where its ancestors are not part of the 'plan'
1995    */
1996   Ptr<TreeNode> treeNodePtr(branchPtr);
1997   prevExecPtr = treeNodePtr;
1998   while (list.next(treeNodePtr))
1999   {
2000     if (treeNodePtr.p->m_predecessors.isclear() &&
2001         predecessors.contains(treeNodePtr.p->m_ancestors) &&
2002         treeNodePtr.p->m_bits & TreeNode::T_INNER_JOIN &&
2003 	treeNodePtr.p->isLookup())
2004     {
2005 
2006       DEBUG("planSequentialExec, append INNER-join lookup treeNode: "
2007 	<< treeNodePtr.p->m_node_no
2008 	<< ", to branch at: " << branchPtr.p->m_node_no
2009         << ", as 'descendant' of node: " << prevExecPtr.p->m_node_no);
2010 
2011       // Add INNER-joined lookup treeNode to the join plan:
2012       const Uint32 err = appendTreeNode(requestPtr, treeNodePtr, prevExecPtr, nextBranchPtr);
2013       if (unlikely(err))
2014         return err;
2015 
2016       predecessors.set(treeNodePtr.p->m_node_no);
2017       prevExecPtr = treeNodePtr;
2018     }
2019   } //for 'all request TreeNodes', starting from branchPtr
2020 
2021   /**
2022    * 2) After this INNER-joined lookup sequence:
2023    * Recursively append a *single* INNER-joined scan-branch, if found.
2024    *
2025    * Note that this branch, including any non-INNER joined branches below,
2026    * are planned to be executed in *parallel* after the 'prevExecPtr',
2027    * which is the end of the sequence of INNER-lookups.
2028    */
2029   treeNodePtr = branchPtr;    //Start over
2030   while (list.next(treeNodePtr))
2031   {
2032     /**
2033      * Scan has to be executed in same order as found in the
2034      * list of TreeNodes. (Legacy of the original SPJ-API result protocol)
2035      */
2036     if (treeNodePtr.p->m_predecessors.isclear() &&
2037         predecessors.contains(treeNodePtr.p->m_ancestors) &&
2038 	treeNodePtr.p->m_bits & TreeNode::T_INNER_JOIN)
2039     {
2040       DEBUG("planSequentialExec, append INNER-joined scan-branch at treeNode: "
2041 	<< treeNodePtr.p->m_node_no);
2042 
2043       ndbassert(treeNodePtr.p->isScan());
2044       const Uint32 err = planSequentialExec(requestPtr, treeNodePtr, prevExecPtr,
2045                                             NullTreeNodePtr);
2046       if (unlikely(err))
2047         return err;
2048       break;
2049     }
2050   } //for 'all request TreeNodes', starting from branchPtr
2051 
2052 
2053   /**
2054    * Note: All INNER-Joins within current 'branch' will now have been handled,
2055    * either directly within this method at 1), or by recursively calling it in 2).
2056    *
2057    * 3a) Collect any non-INNER-joined lookup branches
2058    */
2059   Ptr<TreeNode> outerBranches[NDB_SPJ_MAX_TREE_NODES+1];
2060   int outerCnt = 0;
2061 
2062   treeNodePtr = branchPtr;    //Start over
2063   while (list.next(treeNodePtr))
2064   {
2065     if (treeNodePtr.p->m_predecessors.isclear() &&
2066 	predecessors.contains(treeNodePtr.p->m_ancestors))
2067     {
2068       if (treeNodePtr.p->isLookup() &&
2069 	  !branchPtr.p->m_predecessors.contains(treeNodePtr.p->m_ancestors))
2070       {
2071         // A non-INNER joined lookup-TreeNode
2072         outerBranches[outerCnt++] = treeNodePtr;
2073       }
2074     }
2075   } //for 'all request TreeNodes', starting from branchPtr
2076 
2077   /**
2078    * 3b) Append the non-INNER-joined lookup branches after the end of the
2079    * INNER-joined lookup sequence (at 'prevExecPtr'). They will be executed
2080    * in sequence, in parallel with the scan branch from 2).
2081    *
2082    */
2083   outerBranches[outerCnt] = nextBranchPtr;                      //Resume point for last
2084   for (int i = 0; i < outerCnt; i++)
2085   {
2086     DEBUG("planSequentialExec, append non-INNER-joined branch no: "
2087       << outerBranches[i].p->m_node_no);
2088 
2089     const Uint32 err = planSequentialExec(requestPtr, outerBranches[i], prevExecPtr,
2090 				          outerBranches[i+1]);  //RESUME point
2091     if (unlikely(err))
2092       return err;
2093   }
2094 
2095   /**
2096    * 4) Append any non-INNER joined scan branches to the end of the INNER-joined
2097    * lookup sequence, (at 'prevExecPtr')
2098    */
2099   treeNodePtr = branchPtr;    //Start over
2100   while (list.next(treeNodePtr))
2101   {
2102     if (treeNodePtr.p->m_predecessors.isclear() &&
2103 	predecessors.contains(treeNodePtr.p->m_ancestors))
2104     {
2105       if (!branchPtr.p->m_predecessors.contains(treeNodePtr.p->m_ancestors))
2106       {
2107 	jam();
2108         ndbassert(treeNodePtr.p->isScan());
2109 
2110         DEBUG("planSequentialExec, append non-INNER-joined scan-treeNode: "
2111 	  << treeNodePtr.p->m_node_no
2112           << ", to branch at: " << branchPtr.p->m_node_no
2113           << ", as 'descendant' of node: " << prevExecPtr.p->m_node_no);
2114 
2115         const Uint32 err = planSequentialExec(requestPtr, treeNodePtr, prevExecPtr,
2116                                               NullTreeNodePtr);
2117         if (unlikely(err))
2118           return err;
2119       }
2120     }
2121   }
2122 
2123   return 0;
2124 } // ::planSequentialExec
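/**
 * Worked example of the steps above (illustration only): a request with
 * root scan s0, INNER-joined lookups l1 and l2, an INNER-joined scan s3,
 * and a non-INNER-joined lookup l4, all having s0 as parent:
 *
 *   1) branch(s0) greedily appends the INNER-joined lookups: s0 -> l1 -> l2
 *   2) the single INNER-joined scan branch s3 is appended after l2,
 *      recursively bringing along any lookups depending on s3
 *   3) the non-INNER-joined lookup branch l4 is appended after l2;
 *      had there been several such branches, each would RESUME the next
 *   4) any remaining non-INNER-joined scan branches would be appended
 *      after l2, to be executed in parallel
 *
 * Resulting plan: s0 -> l1 -> l2 -> {s3 and l4, executed in parallel}.
 */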
2125 
2126 
2127 /**
2128  * appendTreeNode()
2129  *
2130  *  Appends 'treeNodePtr' to the execution plan after 'prevExecPtr'.
2131  *  In case 'treeNodePtr' is part of an outer joined tree branch,
2132  *  'nextBranchPtr' may refer to a 'resume point' outside of the current
2133  *  outer joined branch.
2134  *
2135  *  In case execution of a row set within the current branch is
2136  *  terminated because no INNER-joined matches were found, execution
2137  *  will be resumed at 'nextBranchPtr'.
2138  *
2139  *  Fills in the 'predecessors' and 'dependencies' bitmask.
2140  *
2141  *  Sets up any extra 'scheduling policy' described by 'm_resumeEvents'
2142  *  and 'm_resumePtrI', and BUFFERing of rows and/or their match bitmask
2143  *  as required by the chosen scheduling.
2144  */
2145 Uint32
2146 Dbspj::appendTreeNode(Ptr<Request>  requestPtr,
2147                       Ptr<TreeNode> treeNodePtr,
2148                       Ptr<TreeNode> prevExecPtr,
2149                       const Ptr<TreeNode> nextBranchPtr)
2150 {
2151   if (prevExecPtr.isNull())
2152   {
2153     // Assert that no further action would have been required below.
2154     ndbassert(nextBranchPtr.isNull());
2155     ndbassert(treeNodePtr.p->m_parentPtrI == RNIL);
2156     ndbassert(treeNodePtr.p->m_scanAncestorPtrI == RNIL);
2157     return 0;
2158   }
2159 
2160   DEBUG("appendTreeNode, append treeNode: " << treeNodePtr.p->m_node_no
2161     << ", as 'descendant' of node: " << prevExecPtr.p->m_node_no);
2162   {
2163     LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
2164 
2165     // Add treeNode to the execution plan:
2166     Local_dependency_map execList(pool, prevExecPtr.p->m_next_nodes);
2167     if (unlikely(!execList.append(&treeNodePtr.i, 1)))
2168     {
2169       jam();
2170       return DbspjErr::OutOfQueryMemory;
2171     }
2172   }
2173 
2174   treeNodePtr.p->m_predecessors.bitOR(prevExecPtr.p->m_predecessors);
2175   treeNodePtr.p->m_predecessors.set(prevExecPtr.p->m_node_no);
2176 
2177   treeNodePtr.p->m_dependencies = prevExecPtr.p->m_dependencies;
2178   treeNodePtr.p->m_dependencies.set(prevExecPtr.p->m_node_no);
2179 
2180   ndbassert(treeNodePtr.p->m_predecessors.contains(treeNodePtr.p->m_dependencies));
2181   ndbassert(treeNodePtr.p->m_dependencies.contains(treeNodePtr.p->m_ancestors));
2182 
2183   /**
2184    * Below we set up any special scheduling policy.
2185    *
2186    * If nothing is set, completion of a request will submit new request(s) for
2187    * all 'm_next_nodes' in *parallel*. The result rows returned from the request
2188    * will be used directly as the 'parentRow' to produce the new request(s).
2189    *
2190    * So anything set up below is an exception to this basic rule!
2191    */
2192 
2193   /**
2194    * If a 'next branch' is specified, the current branch should start execution
2195    * from this branch when it completes. This is part of our load regulation logic
2196    * which prevents a scan-driven star-join query topology from overflowing
2197    * the job buffers by submitting all its LQHKEYREQs at once.
2198    *
2199    * Instead we now start only the first child lookup operation when a scan
2200    * completes. Completion of requests from this lookup operation will in turn
2201    * either start the next INNER-joined lookup when a TRANSID_AI result arrives,
2202    * or use the 'next branch'-RESUME logic set up below if not INNER-joined.
2203    * Together this maintains a steady pace of LQHKEYREQs being submitted, where
2204    * the total number of submitted REQs in the pipeline will be <= the number
2205    * of rows returned from the preceding scan batch. (A 1::1 fanout)
2206    *
2207    * The 'next branch'-RESUME logic is controlled by setting the following
2208    * m_resumeEvents flags:
2209    *
2210    *  - TN_ENQUEUE_OP: The first TreeNode in a 'next branch' will enqueue
2211    *      the correlation-id of all rows TRANSID_AI-returned from its parent.
2212    *      (As opposed to submitting them for immediate execution). Any of the
2213    *      RESUME-actions below will later pick one of the ENQUEUEd rows
2214    *      for execution. (Also implies that the parent of any ENQUEUEing-TreeNode
2215    *      needs to BUFFER_ROW).
2216    *  - TN_RESUME_REF: If we get a LQHKEYREF-reply, it terminates any further
2217    *      INNER-join operations originating from the head of this branch.
2218    *      As this frees a scheduling quota, we may start an operation from
2219    *      the nextBranch to be executed.
2220    *  - TN_RESUME_CONF: Set only for the last operation in the branch.
2221    *      When it succesfully completes, a scheduling quota is available,
2222    *      When it successfully completes, a scheduling quota is available,
2223    */
2224   if (!nextBranchPtr.isNull())
2225   {
2226     // Should only be used for a lookup resuming another branch of lookups,
2227     // within the same scanAncestor scope.
2228     ndbassert(treeNodePtr.p->isLookup());
2229     ndbassert(nextBranchPtr.p->isLookup());
2230     ndbassert(nextBranchPtr.p->m_scanAncestorPtrI == treeNodePtr.p->m_scanAncestorPtrI);
2231 
2232     treeNodePtr.p->m_resumePtrI = nextBranchPtr.i;
2233     nextBranchPtr.p->m_predecessors.set(treeNodePtr.p->m_node_no);
2234 
2235     /**
2236      * Only the last TreeNode in a branch should have TN_RESUME_CONF set.
2237      * If we now append to a branch having a resume position, remove RESUME_CONF.
2238      */
2239     if (prevExecPtr.p->m_resumePtrI != RNIL)
2240     {
2241       ndbassert(prevExecPtr.p->m_resumeEvents & TreeNode::TN_RESUME_REF);
2242       // Only for the last resuming TreeNode
2243       prevExecPtr.p->m_resumeEvents &= ~TreeNode::TN_RESUME_CONF;
2244     }
2245 
2246     // Assume: Last node in this outer-joined tree branch: Always resume 'next'.
2247     treeNodePtr.p->m_resumeEvents |= TreeNode::TN_RESUME_CONF |
2248                                      TreeNode::TN_RESUME_REF;
2249 
2250     // The 'to be resumed' operations are enqueued at the head of nextBranch.
2251     nextBranchPtr.p->m_resumeEvents |= TreeNode::TN_ENQUEUE_OP;
2252   }
2253 
2254   /**    Example:
2255    *
2256    *       scan1
2257    *       /   \      ====INNER-join executed as===>  scan1 -> scan2 -> scan3
2258    *    scan2  scan3
2259    *
2260    * Considering the case above, both scan2 and scan3 have scan1 as their scanAncestor.
2261    * In an INNER-joined execution plan, we take advantage of the fact that
2262    * a match between scan1 and scan2 rows is required, else 'join scan3' could
2263    * be skipped. Thus, even if scan1 is the scan-ancestor of scan3, we will
2264    * execute scan2 in between these.
2265    *
2266    * Note that the result from scan2 may have multiple TRANSID_AI results returned
2267    * for each row from scan1. Thus we can't directly use the returned scan2 rows
2268    * to trigger production of the scan3 request. (Due to cardinality mismatch).
2269    * The scan3 request has to be produced based on scan1 results!
2270    *
2271    * We set up the scheduling policy below to solve this:
2272    * - TN_EXEC_WAIT is set on 'scan3', which will prevent TRANSID_AI
2273    *     results from scan2 from submitting operations to scan3.
2274    * - TN_RESUME_NODE is set on 'scan3' which will result in
2275    *     ::resumeBufferedNode() being called when all TreeNodes
2276    *     which it depends on have completed their batches.
2277    *     (Also implies that the parent of any to-be-resumed-nodes
2278    *      need to BUFFER_ROW).
2279    *
2280    * ::resumeBufferedNode() will iterate all its buffered parent results.
2281    * For each row we will check whether the required INNER-join matches
2282    * were found in the TreeNodes it has INNER-join dependencies on.
2283    * Non-matching parent rows are skipped from further requests.
2284    *
2285    * We maintain the found matches in the m_match-bitmask in the
2286    * BUFFER structure of each TreeNode scanAncestor. Below we set
2287    * T_BUFFER_MATCH on the scanAncestor, and on all scans in between,
2288    * in order to have the match-bitmap set up.
2289    */
2290   if (treeNodePtr.p->isScan() &&
2291       treeNodePtr.p->m_scanAncestorPtrI != RNIL)
2292   {
2293     Ptr<TreeNode> scanAncestorPtr;
2294     m_treenode_pool.getPtr(scanAncestorPtr, treeNodePtr.p->m_scanAncestorPtrI);
2295     Ptr<TreeNode> ancestorPtr(scanAncestorPtr);
2296 
2297     // Note that scans are always added to exec plan such that their
2298     // relative order is kept.
2299 
2300     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2301     while (list.next(ancestorPtr) && ancestorPtr.i != treeNodePtr.i)
2302     {
2303       if (ancestorPtr.p->isScan() &&
2304 	  treeNodePtr.p->m_dependencies.get(ancestorPtr.p->m_node_no))
2305       {
2306 	/**
2307          * 'ancestorPtr' is a scan executed in between this scan and its scanAncestor.
2308          * It is not among the ancestors of the TreeNode to be executed
2309          */
2310 
2311         // Need 'resume-node' scheduling in preparation for 'next' scan-branch:
2312         treeNodePtr.p->m_resumeEvents |= TreeNode::TN_EXEC_WAIT |
2313 	                                 TreeNode::TN_RESUME_NODE;
2314 
2315         requestPtr.p->m_bits |= Request::RT_BUFFERS;
2316         scanAncestorPtr.p->m_bits |= TreeNode::T_BUFFER_MAP |
2317                                      TreeNode::T_BUFFER_MATCH;
2318 
2319 	/**
2320          * BUFFER_MATCH all scan ancestors of this treeNode which we
2321          * depend on (may exclude some outer-joined scan branches).
2322          */
2323         if (!ancestorPtr.p->isLeaf())
2324         {
2325           ancestorPtr.p->m_bits |= TreeNode::T_BUFFER_MAP |
2326                                    TreeNode::T_BUFFER_MATCH;
2327         }
2328       }
2329     }
2330   }
2331 
2332   /**
2333    * Only the result rows from the 'prevExec' are directly available when
2334    * operations for this TreeNode are scheduled. If that is not the parent
2335    * of this TreeNode, we have to BUFFER the parent rows such that
2336    * they can be looked up by the correlationId when needed. NOTE that
2337    * all Lookup result rows having the same scanAncestor will also
2338    * share the same correlationId as their scanAncestor, such that the
2339    * correlationId from a prevExec result row may be used to
2340    * BUFFER_MAP-locate the related parent rows.
2341    *
2342    * Also take care of buffering parent rows for enqueued ops and
2343    * to-be-resumed nodes, as described above.
2344    */
2345   if (treeNodePtr.p->m_parentPtrI != prevExecPtr.i ||
2346       (treeNodePtr.p->m_resumeEvents & TreeNode::TN_ENQUEUE_OP) ||
2347       (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_NODE))
2348   {
2349     /**
2350      * As execution of this tree branch is not initiated by
2351      * its own parent, we need to buffer the parent rows
2352      * such that they can be located when needed.
2353      */
2354     Ptr<TreeNode> parentPtr;
2355     m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
2356     parentPtr.p->m_bits |= TreeNode::T_BUFFER_MAP |
2357                            TreeNode::T_BUFFER_ROW;
2358     requestPtr.p->m_bits |= Request::RT_BUFFERS;
2359   }
2360 
2361   return 0;
2362 }
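/**
 * Condensed restatement of the m_resumeEvents flags set above (no new
 * logic, just an overview):
 *
 *   TN_ENQUEUE_OP : head of a 'next branch'; parent rows are enqueued
 *                   by correlation-id instead of submitted immediately
 *   TN_RESUME_REF : a LQHKEYREF frees a scheduling quota, so one
 *                   enqueued operation from the next branch is started
 *   TN_RESUME_CONF: set on the last node of a branch; a successful
 *                   completion likewise resumes the next branch
 *   TN_EXEC_WAIT  : don't submit operations directly from the previous
 *                   TreeNode's TRANSID_AI (cardinality mismatch)
 *   TN_RESUME_NODE: start via ::resumeBufferedNode() from buffered
 *                   parent rows once all depended-on batches completed
 */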
2363 
2364 
2365 void
2366 Dbspj::dumpExecPlan(Ptr<Request>  requestPtr,
2367                     Ptr<TreeNode> treeNodePtr)
2368 {
2369   LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
2370   const Local_dependency_map nextExec(pool, treeNodePtr.p->m_next_nodes);
2371   Dependency_map::ConstDataBufferIterator it;
2372 
2373   DEBUG("TreeNode no: " << treeNodePtr.p->m_node_no
2374      << ", coverage is: " << treeNodePtr.p->m_coverage.rep.data[0]
2375      << ", ancestors are: " << treeNodePtr.p->m_ancestors.rep.data[0]
2376      << ", predecessors are: " << treeNodePtr.p->m_predecessors.rep.data[0]
2377      << ", depending on: " << treeNodePtr.p->m_dependencies.rep.data[0]
2378   );
2379 
2380   if (treeNodePtr.p->isLookup())
2381   {
2382     DEBUG("  'Lookup'-node");
2383   }
2384   else if (treeNodePtr.p->isScan())
2385   {
2386     DEBUG("  '(Index-)Scan'-node");
2387   }
2388 
2389   if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_EXEC_WAIT)
2390   {
2391     DEBUG("  has EXEC_WAIT");
2392   }
2393 
2394   if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_ENQUEUE_OP)
2395   {
2396     DEBUG("  ENQUEUE, wait to be resumed");
2397   }
2398   if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_NODE)
2399   {
2400     DEBUG("  has RESUME_NODE");
2401   }
2402   if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_CONF)
2403   {
2404     DEBUG("  has RESUME_CONF");
2405   }
2406   if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_REF)
2407   {
2408     DEBUG("  has RESUME_REF");
2409   }
2410 
2411   static const Uint32 BufferAll = (TreeNode::T_BUFFER_ROW|TreeNode::T_BUFFER_MATCH);
2412   if ((treeNodePtr.p->m_bits & BufferAll) == BufferAll)
2413   {
2414     DEBUG("  BUFFER 'ROWS'+'MATCH'");
2415   }
2416   else if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ROW)
2417   {
2418     DEBUG("  BUFFER 'ROWS'");
2419   }
2420   else if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH)
2421   {
2422     DEBUG("  BUFFER 'MATCH'");
2423   }
2424 
2425   if (treeNodePtr.p->m_resumePtrI != RNIL)
2426   {
2427     Ptr<TreeNode> resumeTreeNodePtr;
2428     m_treenode_pool.getPtr(resumeTreeNodePtr, treeNodePtr.p->m_resumePtrI);
2429     DEBUG("  may resume node: " << resumeTreeNodePtr.p->m_node_no);
2430   }
2431 
2432   for (nextExec.first(it); !it.isNull(); nextExec.next(it))
2433   {
2434     Ptr<TreeNode> nextPtr;
2435     m_treenode_pool.getPtr(nextPtr, * it.data);
2436     DEBUG("  TreeNode no: " << treeNodePtr.p->m_node_no
2437        << ", has nextExec: " << nextPtr.p->m_node_no);
2438   }
2439 
2440   for (nextExec.first(it); !it.isNull(); nextExec.next(it))
2441   {
2442     Ptr<TreeNode> nextPtr;
2443     m_treenode_pool.getPtr(nextPtr, * it.data);
2444     dumpExecPlan(requestPtr, nextPtr);
2445   }
2446 }
2447 
2448 
2449 Uint32
2450 Dbspj::createNode(Build_context& ctx, Ptr<Request> requestPtr,
2451                   Ptr<TreeNode> & treeNodePtr)
2452 {
2453   /**
2454    * In the future, we can have different TreeNode-allocation strategies
2455    *   that can be set up using the Build_context
2456    *
2457    */
2458   if (ERROR_INSERTED_CLEAR(17005))
2459   {
2460     ndbout_c("Injecting OutOfOperations error 17005 at line %d file %s",
2461              __LINE__,  __FILE__);
2462     jam();
2463     return DbspjErr::OutOfOperations;
2464   }
2465   if (m_treenode_pool.seize(requestPtr.p->m_arena, treeNodePtr))
2466   {
2467     DEBUG("createNode - seize -> ptrI: " << treeNodePtr.i);
2468     new (treeNodePtr.p) TreeNode(requestPtr.i);
2469     ctx.m_node_list[ctx.m_cnt] = treeNodePtr;
2470     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2471     list.addLast(treeNodePtr);
2472     treeNodePtr.p->m_node_no = ctx.m_cnt;
2473     return 0;
2474   }
2475   return DbspjErr::OutOfOperations;
2476 }
2477 
2478 /**
2479  * Depending on query type, a 'prepare' phase might be required
2480  * before starting the real data retrieval from the query.
2481  *
2482  * All ::exec<FOO> methods handling replies related to the query
2483  * prepare phase, should call ::checkPrepareComplete() before
2484  * they return.
2485  */
2486 void
2487 Dbspj::prepare(Signal* signal,
2488                Ptr<Request> requestPtr)
2489 {
2490   Uint32 err = 0;
2491   if (requestPtr.p->m_bits & Request::RT_NEED_PREPARE)
2492   {
2493     jam();
2494     requestPtr.p->m_outstanding = 0;
2495     requestPtr.p->m_state = Request::RS_PREPARING;
2496 
2497     Ptr<TreeNode> nodePtr;
2498     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2499     for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
2500     {
2501       jam();
2502       /**
2503        * Verify existence of all involved tables.
2504        */
2505       err = checkTableError(nodePtr);
2506       if (unlikely(err))
2507       {
2508         jam();
2509         break;
2510       }
2511       if (nodePtr.p->m_bits & TreeNode::T_NEED_PREPARE)
2512       {
2513         jam();
2514         ndbassert(nodePtr.p->m_info != NULL);
2515         ndbassert(nodePtr.p->m_info->m_prepare != NULL);
2516         (this->*(nodePtr.p->m_info->m_prepare))(signal, requestPtr, nodePtr);
2517       }
2518     }
2519 
2520     /**
2521      * Preferably, RT_NEED_PREPARE should only be set if blocking
2522      * calls are used, in which case m_outstanding should have been increased
2523      */
2524     ndbassert(err || requestPtr.p->m_outstanding);
2525   }
2526   if (unlikely(err))
2527   {
2528     jam();
2529     abort(signal, requestPtr, err);
2530     return;
2531   }
2532 }
2533 
2534 /**
2535  * Check if all outstanding 'prepare' work has completed.
2536  * After prepare completion, start the query itself.
2537  *
2538  * A prepare completion could also complete the entire request.
2539  * Thus, checkBatchComplete() is also checked as part of
2540  * prepare completion.
2541  */
2542 void
2543 Dbspj::checkPrepareComplete(Signal* signal, Ptr<Request> requestPtr)
2544 {
2545   if (requestPtr.p->m_outstanding > 0)
2546   {
2547     return;
2548   }
2549 
2550   do //To simplify error/exit handling, no real loop
2551   {
2552     jam();
2553     if (unlikely((requestPtr.p->m_state & Request::RS_ABORTING) != 0))
2554     {
2555       jam();
2556       break;
2557     }
2558 
2559     Ptr<TreeNode> nodePtr;
2560     {
2561       Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2562       ndbrequire(list.first(nodePtr));
2563     }
2564     Uint32 err = checkTableError(nodePtr);
2565     if (unlikely(err != 0))
2566     {
2567       jam();
2568       abort(signal, requestPtr, err);
2569       break;
2570     }
2571 
2572     requestPtr.p->m_state = Request::RS_RUNNING;
2573     ndbrequire(nodePtr.p->m_info != 0 && nodePtr.p->m_info->m_start != 0);
2574     (this->*(nodePtr.p->m_info->m_start))(signal, requestPtr, nodePtr);
2575   }
2576   while (0);
2577 
2578   //Possibly completed (or failed) entire request.
2579   checkBatchComplete(signal, requestPtr);
2580 }
2581 
2582 /**
2583  * Check if all outstanding work for 'Request' has completed.
2584  *
2585  * All ::exec<FOO> methods handling replies related to query
2586  * execution, *must* call ::checkBatchComplete() before returning.
2587  */
2588 void
2589 Dbspj::checkBatchComplete(Signal* signal, Ptr<Request> requestPtr)
2590 {
2591   if (unlikely(requestPtr.p->m_outstanding == 0))
2592   {
2593     jam();
2594     batchComplete(signal, requestPtr);
2595   }
2596 }
2597 
2598 /**
2599  * Request has completed all outstanding work.
2600  * Signal API about completion status and cleanup
2601  * resources if appropriate.
2602  *
2603  * NOTE: A Request might ::batchComplete() twice if
2604  * a completion phase is required. It will then be called
2605  * the last time from ::complete()
2606  */
2607 void
2608 Dbspj::batchComplete(Signal* signal, Ptr<Request> requestPtr)
2609 {
2610   ndbrequire(requestPtr.p->m_outstanding == 0); // "definition" of batchComplete
2611 
2612   bool is_complete = requestPtr.p->m_cnt_active == 0;
2613   bool need_complete_phase = requestPtr.p->m_bits & Request::RT_NEED_COMPLETE;
2614 
2615   if (requestPtr.p->isLookup())
2616   {
2617     ndbassert(requestPtr.p->m_cnt_active == 0);
2618   }
2619 
2620   if (!is_complete || (is_complete && need_complete_phase == false))
2621   {
2622     /**
2623      * one batch complete, and either
2624      *   - request not complete
2625      *   - or not complete_phase needed
2626      */
2627     jam();
2628 
2629     if ((requestPtr.p->m_state & Request::RS_ABORTING) != 0)
2630     {
2631       ndbassert(is_complete);
2632     }
2633 
2634     prepareNextBatch(signal, requestPtr);
2635     sendConf(signal, requestPtr, is_complete);
2636   }
2637   else if (is_complete && need_complete_phase)
2638   {
2639     jam();
2640     /**
2641      * run complete-phase
2642      */
2643     complete(signal, requestPtr);
2644     return;
2645   }
2646 
2647   if (requestPtr.p->m_cnt_active == 0)
2648   {
2649     jam();
2650     /**
2651      * Entire Request completed
2652      */
2653     cleanup(requestPtr);
2654   }
2655   else
2656   {
2657     jam();
2658     /**
2659      * Cleanup the TreeNode branches getting another
2660      * batch of result rows.
2661      */
2662     cleanupBatch(requestPtr);
2663   }
2664 }
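/**
 * The completion logic above reduces to this decision table (restated
 * for overview; 'complete' means m_cnt_active == 0):
 *
 *   complete | RT_NEED_COMPLETE | action
 *   ---------+------------------+--------------------------------------
 *    no      | (any)            | prepareNextBatch + sendConf, then
 *            |                  | cleanupBatch of the restarted branches
 *    yes     | no               | prepareNextBatch + sendConf + cleanup
 *    yes     | yes              | run complete(); batchComplete() is
 *            |                  | then entered a second time from there
 */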
2665 
2666 /**
2667  * Locate next TreeNode(s) to retrieve more rows from.
2668  *
2669  *   Calculate set of the 'm_active_tree_nodes' we will receive from in NEXTREQ.
2670  *   Add these TreeNodes to the cursor list to be iterated.
2671  */
2672 void
2673 Dbspj::prepareNextBatch(Signal* signal, Ptr<Request> requestPtr)
2674 {
2675   requestPtr.p->m_cursor_nodes.init();
2676   requestPtr.p->m_active_tree_nodes.clear();
2677 
2678   if (requestPtr.p->m_cnt_active == 0)
2679   {
2680     jam();
2681     return;
2682   }
2683 
2684   DEBUG("prepareNextBatch, request: " << requestPtr.i);
2685 
2686   if (requestPtr.p->m_bits & Request::RT_REPEAT_SCAN_RESULT)
2687   {
2688     /**
2689      * If REPEAT_SCAN_RESULT we handle bushy scans by returning more *new* rows
2690      * from only one of the active child scans. If there are multiple
2691      * bushy scans not being able to return their current result set in
2692      * a single batch, result sets from the other child scans are repeated
2693      * until all rows have been returned to the API client.
2694      *
2695      * Hence, the cross joined results from the bushy scans are partly
2696      * produced within the SPJ block on a 'batchsize granularity',
2697      * and partly is the responsibility of the API-client by iterating
2698      * the result rows within the current result batches.
2699      * (As opposed to non-REPEAT_SCAN_RESULT, the client only has to care about
2700      *  the current batched rows - no buffering is required)
2701      */
2702     jam();
2703     Ptr<TreeNode> nodePtr;
2704     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2705 
2706     /**
2707      * Locate the last 'TN_ACTIVE' TreeNode, which is the only one chosen
2708      * to return more *new* rows.
2709      */
2710     for (list.last(nodePtr); !nodePtr.isNull(); list.prev(nodePtr))
2711     {
2712       if (nodePtr.p->m_state == TreeNode::TN_ACTIVE)
2713       {
2714         jam();
2715         DEBUG("Will fetch more from 'active' m_node_no: " << nodePtr.p->m_node_no);
2716         /**
2717          * A later NEXTREQ will request a *new* batch of rows from this TreeNode.
2718          */
2719         registerActiveCursor(requestPtr, nodePtr);
2720         break;
2721       }
2722     }
2723 
2724     /**
2725      *  Restart/repeat other (fragment scan) child batches which:
2726      *    - Being 'after' nodePtr located above.
2727      *    - Are placed 'after' the nodePtr located above.
2728      *    - Do not have any 'active' TreeNode among their predecessors.
2729      *      (As these scans are started when rows from these parent nodes
2730      *      arrive.)
2731     if (!nodePtr.isNull())
2732     {
2733       jam();
2734       DEBUG("Calculate 'active', w/ cursor on m_node_no: " << nodePtr.p->m_node_no);
2735 
2736       /* Restart any partial fragment-scans after this 'TN_ACTIVE' TreeNode */
2737       for (list.next(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
2738       {
2739 	jam();
2740         if (!nodePtr.p->m_predecessors.overlaps (requestPtr.p->m_active_tree_nodes))
2741         {
2742           jam();
2743           ndbrequire(nodePtr.p->m_state != TreeNode::TN_ACTIVE);
2744           ndbrequire(nodePtr.p->m_info != 0);
2745           if (nodePtr.p->m_info->m_parent_batch_repeat != 0)
2746           {
2747             jam();
2748             (this->*(nodePtr.p->m_info->m_parent_batch_repeat))(signal,
2749                                                                 requestPtr,
2750                                                                 nodePtr);
2751           }
2752         }
2753         /**
2754          * Adapt to SPJ-API protocol legacy:
2755          *   API always assumed that any node having an 'active' node as
2756          *   ancestor gets a new batch of result rows. So we didn't explicitly
2757          *   set the 'active' bit for these siblings, as it was implicit.
2758          *   In addition, we might now have (INNER-join) dependencies outside
2759          *   of the set of ancestor nodes. If such a dependent node, not being one
2760         *   of our ancestors, is 'active', it will also re-activate this TreeNode.
2761         *   We have to inform the API about that.
2762          */
2763         else if (!nodePtr.p->m_ancestors.overlaps (requestPtr.p->m_active_tree_nodes))
2764         {
2765           requestPtr.p->m_active_tree_nodes.set(nodePtr.p->m_node_no);
2766         }
2767       }
2768     } // if (!nodePtr.isNull()
2769   }
2770   else  // not 'RT_REPEAT_SCAN_RESULT'
2771   {
2772     /**
2773      * If not REPEAT_SCAN_RESULT multiple active TreeNodes may return their
2774      * remaining result simultaneously. In case of bushy-scans, these
2775      * concurrent result streams are cross joins of each other
2776      * in SQL terms. In order to produce the cross joined result, it is
2777      * the responsibility of the API-client to buffer these streams and
2778      * iterate them to produce the cross join.
2779      */
2780     jam();
2781     Ptr<TreeNode> nodePtr;
2782     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2783     TreeNodeBitMask predecessors_of_active;
2784 
2785     for (list.last(nodePtr); !nodePtr.isNull(); list.prev(nodePtr))
2786     {
2787       /**
2788       * If we are active (i.e. not consumed all rows originating
2789        *   from parent rows) and we are not in the set of parents
2790        *   for any active child:
2791        *
2792        * Then, this is a position that execSCAN_NEXTREQ should continue
2793        */
2794       if (nodePtr.p->m_state == TreeNode::TN_ACTIVE &&
2795          !predecessors_of_active.get(nodePtr.p->m_node_no))
2796       {
2797         jam();
2798         DEBUG("Add 'active' m_node_no: " << nodePtr.p->m_node_no);
2799         registerActiveCursor(requestPtr, nodePtr);
2800         predecessors_of_active.bitOR(nodePtr.p->m_predecessors);
2801       }
2802     }
2803   } // if (RT_REPEAT_SCAN_RESULT)
2804 
2805   DEBUG("Calculated 'm_active_tree_nodes': " << requestPtr.p->m_active_tree_nodes.rep.data[0]);
2806 }
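/**
 * Example of the REPEAT_SCAN_RESULT handling above (illustration only):
 * with bushy sibling scans s1 and s2, where s1 is the last 'TN_ACTIVE'
 * TreeNode, the next NEXTREQ fetches *new* rows only from s1, while s2
 * is restarted so that its rows are repeated. The API client thus sees
 *
 *   {new s1 batch} x {repeated s2 result}
 *
 * batch by batch, until every row combination has been returned.
 */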
2807 
2808 void
2809 Dbspj::registerActiveCursor(Ptr<Request> requestPtr, Ptr<TreeNode> treeNodePtr)
2810 {
2811   Uint32 bit = treeNodePtr.p->m_node_no;
2812   ndbrequire(!requestPtr.p->m_active_tree_nodes.get(bit));
2813   requestPtr.p->m_active_tree_nodes.set(bit);
2814 
2815   Local_TreeNodeCursor_list list(m_treenode_pool, requestPtr.p->m_cursor_nodes);
2816 #ifdef VM_TRACE
2817   {
2818     Ptr<TreeNode> nodePtr;
2819     for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
2820     {
2821       ndbrequire(nodePtr.i != treeNodePtr.i);
2822     }
2823   }
2824 #endif
2825   list.addFirst(treeNodePtr);
2826 }
2827 
2828 void
2829 Dbspj::sendConf(Signal* signal, Ptr<Request> requestPtr, bool is_complete)
2830 {
2831   if (requestPtr.p->isScan())
2832   {
2833     if (unlikely((requestPtr.p->m_state & Request::RS_WAITING) != 0))
2834     {
2835       jam();
2836       /**
2837       * We aborted the request ourselves (due to node-failure?)
2838        *   but TC hasn't contacted us... so we can't reply yet...
2839        */
2840       ndbrequire(is_complete);
2841       ndbrequire((requestPtr.p->m_state & Request::RS_ABORTING) != 0);
2842       return;
2843     }
2844 
2845     if (requestPtr.p->m_errCode == 0)
2846     {
2847       jam();
2848       ScanFragConf * conf=
2849         reinterpret_cast<ScanFragConf*>(signal->getDataPtrSend());
2850       conf->senderData = requestPtr.p->m_senderData;
2851       conf->transId1 = requestPtr.p->m_transId[0];
2852       conf->transId2 = requestPtr.p->m_transId[1];
2853       conf->completedOps = requestPtr.p->m_rows;
2854       conf->fragmentCompleted = is_complete ? 1 : 0;
2855       conf->total_len = requestPtr.p->m_active_tree_nodes.rep.data[0];
2856 
2857       /**
2858        * Collect the map of nodes still having more rows to return.
2859        * Note that this 'activeMask' is returned as part of the
2860        * extended format of the ScanFragConf signal introduced in wl7636.
2861        * If returned to a TC node not yet upgraded, the extended part
2862        * of the ScanFragConf is simply ignored.
2863        */
2864       Uint32 activeMask = 0;
2865       Ptr<TreeNode> treeNodePtr;
2866       Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2867 
2868       for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
2869       {
2870         if (treeNodePtr.p->m_state == TreeNode::TN_ACTIVE)
2871         {
2872           DBUG_ASSERT(treeNodePtr.p->m_node_no <= 31);
2873           activeMask |= (1 << treeNodePtr.p->m_node_no);
2874         }
2875       }
2876       conf->activeMask = activeMask;
2877       c_Counters.incr_counter(CI_SCAN_BATCHES_RETURNED, 1);
2878       c_Counters.incr_counter(CI_SCAN_ROWS_RETURNED, requestPtr.p->m_rows);
2879 
2880 #ifdef SPJ_TRACE_TIME
2881       const NDB_TICKS now = NdbTick_getCurrentTicks();
2882       const NDB_TICKS then = requestPtr.p->m_save_time;
2883       const Uint64 diff = NdbTick_Elapsed(then,now).microSec();
2884 
2885       requestPtr.p->m_sum_rows += requestPtr.p->m_rows;
2886       requestPtr.p->m_sum_running += Uint32(diff);
2887       requestPtr.p->m_cnt_batches++;
2888       requestPtr.p->m_save_time = now;
2889 
2890       if (is_complete)
2891       {
2892         Uint32 cnt = requestPtr.p->m_cnt_batches;
2893         ndbout_c("batches: %u avg_rows: %u avg_running: %u avg_wait: %u",
2894                  cnt,
2895                  (requestPtr.p->m_sum_rows / cnt),
2896                  (requestPtr.p->m_sum_running / cnt),
2897                  cnt == 1 ? 0 : requestPtr.p->m_sum_waiting / (cnt - 1));
2898       }
2899 #endif
2900 
2901       /**
2902        * reset for next batch
2903        */
2904       requestPtr.p->m_rows = 0;
2905       if (!is_complete)
2906       {
2907         jam();
2908         requestPtr.p->m_state |= Request::RS_WAITING;
2909       }
2910 #ifdef DEBUG_SCAN_FRAGREQ
2911       ndbout_c("Dbspj::sendConf() sending SCAN_FRAGCONF ");
2912       printSCAN_FRAGCONF(stdout, signal->getDataPtrSend(),
2913                          conf->total_len,
2914                          DBLQH);
2915 #endif
2916       sendSignal(requestPtr.p->m_senderRef, GSN_SCAN_FRAGCONF, signal,
2917                  ScanFragConf::SignalLength_ext, JBB);
2918     }
2919     else
2920     {
2921       jam();
2922       ndbrequire(is_complete);
2923       ScanFragRef * ref=
2924         reinterpret_cast<ScanFragRef*>(signal->getDataPtrSend());
2925       ref->senderData = requestPtr.p->m_senderData;
2926       ref->transId1 = requestPtr.p->m_transId[0];
2927       ref->transId2 = requestPtr.p->m_transId[1];
2928       ref->errorCode = requestPtr.p->m_errCode;
2929 
2930       sendSignal(requestPtr.p->m_senderRef, GSN_SCAN_FRAGREF, signal,
2931                  ScanFragRef::SignalLength, JBB);
2932     }
2933   }
2934   else
2935   {
2936     ndbassert(is_complete);
2937     if (requestPtr.p->m_errCode)
2938     {
2939       jam();
2940       Uint32 resultRef = getResultRef(requestPtr);
2941       TcKeyRef* ref = (TcKeyRef*)signal->getDataPtr();
2942       ref->connectPtr = requestPtr.p->m_senderData;
2943       ref->transId[0] = requestPtr.p->m_transId[0];
2944       ref->transId[1] = requestPtr.p->m_transId[1];
2945       ref->errorCode = requestPtr.p->m_errCode;
2946       ref->errorData = 0;
2947 
2948       sendTCKEYREF(signal, resultRef, requestPtr.p->m_senderRef);
2949     }
2950   }
2951 
2952   if (ERROR_INSERTED(17531))
2953   {
2954     /**
2955      * Takes effect for *next* 'long' SPJ signal which will fail
2956      * to alloc long mem section. Dbspj::execSIGNAL_DROPPED_REP()
2957      * will then be called, which is what we intend to test here.
2958      */
2959     jam();
2960     ErrorSignalReceive= DBSPJ;
2961     ErrorMaxSegmentsToSeize= 1;
2962   }
2963 }
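/**
 * The 'activeMask' built above is a plain bitmap indexed by m_node_no.
 * A minimal sketch of how a receiver could test it (illustration only;
 * the helper name 'isNodeActive' is hypothetical):
 *
 *   static inline bool isNodeActive(Uint32 activeMask, Uint32 nodeNo)
 *   {
 *     assert(nodeNo <= 31);            // the mask holds at most 32 nodes
 *     return (activeMask & (Uint32(1) << nodeNo)) != 0;
 *   }
 */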
2964 
2965 Uint32
2966 Dbspj::getResultRef(Ptr<Request> requestPtr)
2967 {
2968   Ptr<TreeNode> nodePtr;
2969   Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2970   for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
2971   {
2972     if (nodePtr.p->isLookup())
2973     {
2974       jam();
2975       return nodePtr.p->m_lookup_data.m_api_resultRef;
2976     }
2977   }
2978   ndbabort();
2979   return 0;
2980 }
2981 
2982 /**
2983  * Cleanup resources in preparation for a SCAN_NEXTREQ
2984  * requesting a new batch of rows.
2985  */
2986 void
2987 Dbspj::cleanupBatch(Ptr<Request> requestPtr)
2988 {
2989   /**
2990    * Needs to be at least 1 active, otherwise we should have
2991    *   taken the Request cleanup "path" in batchComplete
2992    */
2993   ndbassert(requestPtr.p->m_cnt_active >= 1);
2994 
2995   /**
2996    * Release any buffered rows for the TreeNode branches
2997    * getting new rows.
2998    */
2999   if ((requestPtr.p->m_bits & Request::RT_BUFFERS) != 0)
3000   {
3001     if ((requestPtr.p->m_bits & Request::RT_MULTI_SCAN) != 0)
3002     {
3003       jam();
3004       /**
3005        * A MULTI_SCAN may selectively retrieve rows from only
3006        * some of the (scan-) branches in the Request.
3007        * Selectively release from only these branches.
3008        */
3009       releaseScanBuffers(requestPtr);
3010     }
3011     else
3012     {
3013       jam();
3014       /**
3015        * if not multiple scans in request, simply release all pages allocated
3016        * for row buffers (all rows will be released anyway)
3017        */
3018       // The root node should be the one and only one still active
3019       ndbassert(requestPtr.p->m_cnt_active == 1);
3020       ndbassert(requestPtr.p->m_active_tree_nodes.get(0));
3021       releaseRequestBuffers(requestPtr);
3022     }
3023   } //RT_BUFFERS
3024 
3025   Ptr<TreeNode> treeNodePtr;
3026   Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3027 
3028   for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
3029   {
3030     /**
3031      * Re-init row buffer structures for those treeNodes getting more rows
3032      * in the following NEXTREQ, including all their children.
3033      */
3034     if (requestPtr.p->m_active_tree_nodes.get(treeNodePtr.p->m_node_no) ||
3035         requestPtr.p->m_active_tree_nodes.overlaps(treeNodePtr.p->m_predecessors))
3036     {
3037       jam();
3038       treeNodePtr.p->m_rows.init();
3039     }
3040 
3041     /* Clear parents 'm_matched' bit for all buffered rows: */
3042     if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH)
3043     {
3044       RowIterator iter;
3045       for (first(treeNodePtr.p->m_rows, iter); !iter.isNull(); next(iter))
3046       {
3047         jam();
3048         RowPtr row;
3049         setupRowPtr(treeNodePtr, row,
3050                     iter.m_base.m_ref, iter.m_base.m_row_ptr);
3051 
3052         row.m_matched->bitANDC(requestPtr.p->m_active_tree_nodes);
3053       }
3054     }
3055 
3056     /**
3057      * Do further cleanup in treeNodes having predecessors getting more rows.
3058      * (Which excludes the restarted treeNode itself)
3059      */
3060     if (requestPtr.p->m_active_tree_nodes.overlaps(treeNodePtr.p->m_predecessors))
3061     {
3062       jam();
3063       /**
3064        * Common TreeNode cleanup:
3065        * Deferred operations will have correlation ids which may refer
3066        * to buffered rows released above. These are allocated in
3067        * the m_batchArena released below.
3068        * As an optimization we do not explicitly 'release()' these
3069        * correlation id's:
3070        *  - There could easily be some hundreds of them, released
3071        *    one by one in a loop.
3072        *  - At the innermost level the release() is more or less a NOOP,
3073        *    as Arena-allocated memory can't be released for reuse.
3074        */
3075       m_arenaAllocator.release(treeNodePtr.p->m_batchArena);
3076       treeNodePtr.p->m_deferred.init();
3077 
3078       /**
3079        * TreeNode-type specific cleanup.
3080        */
3081       if (treeNodePtr.p->m_info->m_parent_batch_cleanup != 0)
3082       {
3083         jam();
3084         (this->*(treeNodePtr.p->m_info->m_parent_batch_cleanup))(requestPtr,
3085                                                                  treeNodePtr);
3086       }
3087     }
3088   }
3089 }
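/**
 * Example of the m_matched maintenance above (illustration only): if a
 * buffered row had m_matched = {t0,t2,t3} and the NEXTREQ re-activates
 * {t2,t3}, bitANDC leaves m_matched = {t0}. Matches against the
 * restarted branches must then be re-proven by their new batches.
 */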
3090 
3091 void
3092 Dbspj::releaseScanBuffers(Ptr<Request> requestPtr)
3093 {
3094   Ptr<TreeNode> treeNodePtr;
3095   Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3096 
3097   for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
3098   {
3099     /**
3100      * Release buffered rows for all treeNodes getting more rows
3101      * in the following NEXTREQ, including all their children.
3102      */
3103     if (requestPtr.p->m_active_tree_nodes.get(treeNodePtr.p->m_node_no) ||
3104         requestPtr.p->m_active_tree_nodes.overlaps(treeNodePtr.p->m_predecessors))
3105     {
3106       if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY)
3107       {
3108         jam();
3109         releaseNodeRows(requestPtr, treeNodePtr);
3110       }
3111     }
3112   }
3113 }
3114 
3115 void
3116 Dbspj::releaseNodeRows(Ptr<Request> requestPtr, Ptr<TreeNode> treeNodePtr)
3117 {
3118   /**
3119    * Release all rows associated with tree node
3120    */
3121   DEBUG("releaseNodeRows"
3122      << ", node: " << treeNodePtr.p->m_node_no
3123      << ", request: " << requestPtr.i
3124   );
3125 
3126   ndbassert(treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY);
3127 
3128   Uint32 cnt = 0;
3129   RowIterator iter;
3130   for (first(treeNodePtr.p->m_rows, iter); !iter.isNull(); )
3131   {
3132     jam();
3133     RowRef pos = iter.m_base.m_ref;
3134     next(iter);
3135     releaseRow(treeNodePtr, pos);
3136     cnt ++;
3137   }
3138   DEBUG("RowIterator: released " << cnt << " rows!");
3139 
3140   if (treeNodePtr.p->m_rows.m_type == RowCollection::COLLECTION_MAP)
3141   {
3142     jam();
3143     // Release the (now empty) RowMap
3144     RowMap& map = treeNodePtr.p->m_rows.m_map;
3145     if (!map.isNull())
3146     {
3147       jam();
3148       RowRef ref;
3149       map.copyto(ref);
3150       releaseRow(treeNodePtr, ref);  // Map was allocated in row memory
3151     }
3152   }
3153 }
3154 
3155 void
3156 Dbspj::releaseRow(Ptr<TreeNode> treeNodePtr, RowRef pos)
3157 {
3158   // Only valid for var-alloc'ed rows; stack allocations are popped
3159   // wholesale, without considering individual rows.
3160   const RowCollection& collection = treeNodePtr.p->m_rows;
3161   ndbassert(collection.m_base.m_rowBuffer != NULL);
3162   ndbassert(collection.m_base.m_rowBuffer->m_type == BUFFER_VAR);
3163   ndbassert(pos.m_alloc_type == BUFFER_VAR);
3164 
3165   RowBuffer& rowBuffer = *collection.m_base.m_rowBuffer;
3166   Ptr<RowPage> ptr;
3167   m_page_pool.getPtr(ptr, pos.m_page_id);
3168   ((Var_page*)ptr.p)->free_record(pos.m_page_pos, Var_page::CHAIN);
3169   Uint32 free_space = ((Var_page*)ptr.p)->free_space;
3170   if (free_space == Var_page::DATA_WORDS - 1)
3171   {
3172     jam();
3173     Local_RowPage_fifo list(m_page_pool,
3174                                   rowBuffer.m_page_list);
3175     const bool last = list.hasNext(ptr) == false;
3176     list.remove(ptr);
3177     if (list.isEmpty())
3178     {
3179       jam();
3180       /**
3181        * Don't remove last page...
3182        */
3183       list.addLast(ptr);
3184       rowBuffer.m_var.m_free = free_space;
3185     }
3186     else
3187     {
3188       jam();
3189       if (last)
3190       {
3191         jam();
3192         /**
3193          * If we were last...set m_var.m_free to free_space of newLastPtr
3194          */
3195         Ptr<RowPage> newLastPtr;
3196         ndbrequire(list.last(newLastPtr));
3197         rowBuffer.m_var.m_free = ((Var_page*)newLastPtr.p)->free_space;
3198       }
3199       releasePage(ptr);
3200     }
3201   }
3202   else if (free_space > rowBuffer.m_var.m_free)
3203   {
3204     jam();
3205     Local_RowPage_fifo list(m_page_pool,
3206                                   rowBuffer.m_page_list);
3207     list.remove(ptr);
3208     list.addLast(ptr);
3209     rowBuffer.m_var.m_free = free_space;
3210   }
3211 }
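/**
 * Editor's note - a sketch (not upstream code) of the invariant which
 * ::releaseRow() and ::varAlloc() cooperate to maintain, assuming the
 * buffer is only manipulated through those two functions: rows are
 * always allocated from the last page in the buffer's page fifo, and
 * m_var.m_free mirrors that page's free space:
 *
 *   Local_RowPage_fifo list(m_page_pool, rowBuffer.m_page_list);
 *   Ptr<RowPage> last;
 *   if (list.last(last))
 *   {
 *     ndbassert(rowBuffer.m_var.m_free == ((Var_page*)last.p)->free_space);
 *   }
 *
 * Pages which become completely empty are unlinked and recycled via
 * ::releasePage(), except for the very last page, which is kept.
 */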
3212 
3213 void
3214 Dbspj::releaseRequestBuffers(Ptr<Request> requestPtr)
3215 {
3216   DEBUG("releaseRequestBuffers"
3217      << ", request: " << requestPtr.i
3218   );
3219   /**
3220    * Release all pages for request
3221    */
3222   {
3223     Local_RowPage_list freelist(m_page_pool, m_free_page_list);
3224     freelist.prependList(requestPtr.p->m_rowBuffer.m_page_list);
3225   }
3226   requestPtr.p->m_rowBuffer.reset();
3227 }
3228 
3229 /**
3230  * Handle that batch for this 'TreeNode' is complete.
3231  */
3232 void
3233 Dbspj::handleTreeNodeComplete(Signal * signal, Ptr<Request> requestPtr,
3234                               Ptr<TreeNode> treeNodePtr)
3235 {
3236   if ((requestPtr.p->m_state & Request::RS_ABORTING) == 0)
3237   {
3238     jam();
3239     ndbassert(requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));
3240     ndbassert(treeNodePtr.p->m_deferred.isEmpty());
3241 
3242     /**
3243      * If all predecessors are complete, this has to be reported
3244      * as we might be waiting for this condition to start more
3245      * operations.
3246      */
3247     if (requestPtr.p->m_completed_tree_nodes.contains(treeNodePtr.p->m_predecessors))
3248     {
3249       jam();
3250       reportAncestorsComplete(signal, requestPtr, treeNodePtr);
3251     }
3252   }
3253 }
3254 
3255 /**
3256  * Notify any TreeNode(s) to be executed after the completed
3257  * TreeNode that their predecessors have completed their batches.
3258  */
3259 void
3260 Dbspj::reportAncestorsComplete(Signal * signal, Ptr<Request> requestPtr,
3261                                Ptr<TreeNode> treeNodePtr)
3262 {
3263   DEBUG("reportAncestorsComplete: " << treeNodePtr.p->m_node_no);
3264 
3265   {
3266     jam();
3267     LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
3268     Local_dependency_map nextExec(pool, treeNodePtr.p->m_next_nodes);
3269     Dependency_map::ConstDataBufferIterator it;
3270 
3271     for (nextExec.first(it); !it.isNull(); nextExec.next(it))
3272     {
3273       jam();
3274       Ptr<TreeNode> nextTreeNodePtr;
3275       m_treenode_pool.getPtr(nextTreeNodePtr, *it.data);
3276 
3277       /**
3278        * Notify all TreeNodes which depend on the completed predecessors.
3279        */
3280       if (requestPtr.p->m_completed_tree_nodes.contains(nextTreeNodePtr.p->m_predecessors))
3281       {
3282         ndbassert(nextTreeNodePtr.p->m_deferred.isEmpty());
3283 
3284         if (nextTreeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_NODE)
3285         {
3286           resumeBufferedNode(signal, requestPtr, nextTreeNodePtr);
3287         }
3288 
3289         /* Notify only TreeNodes which have requested a completion notification. */
3290         if (nextTreeNodePtr.p->m_bits & TreeNode::T_NEED_REPORT_BATCH_COMPLETED)
3291         {
3292           jam();
3293           ndbassert(nextTreeNodePtr.p->m_info != NULL);
3294           ndbassert(nextTreeNodePtr.p->m_info->m_parent_batch_complete != NULL);
3295           (this->*(nextTreeNodePtr.p->m_info->m_parent_batch_complete))(signal,
3296                                                                  requestPtr,
3297                                                                  nextTreeNodePtr);
3298         }
3299         reportAncestorsComplete(signal, requestPtr, nextTreeNodePtr);
3300       }
3301     }
3302   }
3303 }
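/**
 * Editor's note - a worked example of the propagation above, under an
 * assumed plan n0 -> n1 -> n2 where n2's m_predecessors = {n0, n1}:
 * when n1 completes and m_completed_tree_nodes already contains n0,
 * the loop visits n2; n2 is resumed if it waits on TN_RESUME_NODE,
 * its m_parent_batch_complete callback fires if it requested
 * T_NEED_REPORT_BATCH_COMPLETED, and the recursion then continues
 * with n2's own m_next_nodes.
 */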
3304 
3305 /**
3306  * Set the Request to ABORTING state, and where appropriate,
3307  * inform any participating LDMs about the decision to
3308  * terminate the query.
3309  *
3310  * NOTE: No reply is yet sent to the API. This is taken care of by
3311  * the outermost ::exec<FOO> methods calling either ::checkPrepareComplete()
3312  * or ::checkBatchComplete(), which send a CONF/REF reply when all
3313  * 'outstanding' work is done.
3314  */
3315 void
3316 Dbspj::abort(Signal* signal, Ptr<Request> requestPtr, Uint32 errCode)
3317 {
3318   jam();
3319   if ((requestPtr.p->m_state & Request::RS_ABORTING) != 0)
3320   {
3321     jam();
3322     return;
3323   }
3324 
3325   requestPtr.p->m_state |= Request::RS_ABORTING;
3326   requestPtr.p->m_errCode = errCode;
3327 
3328   {
3329     Ptr<TreeNode> nodePtr;
3330     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3331     for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
3332     {
3333       jam();
3334       ndbrequire(nodePtr.p->m_info != 0);
3335       if (nodePtr.p->m_info->m_abort != 0)
3336       {
3337         jam();
3338         (this->*(nodePtr.p->m_info->m_abort))(signal, requestPtr, nodePtr);
3339       }
3340     }
3341   }
3342 }
3343 
3344 Uint32
3345 Dbspj::nodeFail(Signal* signal, Ptr<Request> requestPtr,
3346                 NdbNodeBitmask nodes)
3347 {
3348   Uint32 cnt = 0;
3349   Uint32 iter = 0;
3350 
3351   {
3352     Ptr<TreeNode> nodePtr;
3353     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3354     for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
3355     {
3356       jam();
3357       ndbrequire(nodePtr.p->m_info != 0);
3358       if (nodePtr.p->m_info->m_execNODE_FAILREP != 0)
3359       {
3360         jam();
3361         iter ++;
3362         cnt += (this->*(nodePtr.p->m_info->m_execNODE_FAILREP))(signal,
3363                                                                 requestPtr,
3364                                                                 nodePtr, nodes);
3365       }
3366     }
3367   }
3368 
3369   if (cnt == 0)
3370   {
3371     jam();
3372     /**
3373      * None of the operations needed any NodeFailRep "action".
3374      * Check whether our TC has died - only needed in the scan
3375      * case; a lookup does not care whether TC is still alive.
3376      */
3377     if (requestPtr.p->isLookup())
3378     {
3379       jam();
3380       return 0;  //Lookup: Don't care about TC still alive
3381     }
3382     else if (!nodes.get(refToNode(requestPtr.p->m_senderRef)))
3383     {
3384       jam();
3385       return 0;  //Scan: Requesting TC is still alive.
3386     }
3387   }
3388 
3389   jam();
3390   abort(signal, requestPtr, DbspjErr::NodeFailure);
3391   checkBatchComplete(signal, requestPtr);
3392 
3393   return cnt + iter;
3394 }
3395 
3396 void
3397 Dbspj::complete(Signal* signal, Ptr<Request> requestPtr)
3398 {
3399   /**
3400    * We need to run the complete-phase before sending the last SCAN_FRAGCONF.
3401    */
3402   Uint32 flags = requestPtr.p->m_state &
3403     (Request::RS_ABORTING | Request::RS_WAITING);
3404 
3405   requestPtr.p->m_state = Request::RS_COMPLETING | flags;
3406 
3407   // Clear bit so that the next batchComplete()
3408   // will continue the cleanup.
3409   ndbassert((requestPtr.p->m_bits & Request::RT_NEED_COMPLETE) != 0);
3410   requestPtr.p->m_bits &= ~(Uint32)Request::RT_NEED_COMPLETE;
3411   ndbassert(requestPtr.p->m_outstanding == 0);
3412   requestPtr.p->m_outstanding = 0;
3413   {
3414     Ptr<TreeNode> nodePtr;
3415     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3416     for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
3417     {
3418       jam();
3419       if (nodePtr.p->m_bits & TreeNode::T_NEED_COMPLETE)
3420       {
3421         jam();
3422         ndbassert(nodePtr.p->m_info != NULL);
3423         ndbassert(nodePtr.p->m_info->m_complete != NULL);
3424         (this->*(nodePtr.p->m_info->m_complete))(signal, requestPtr, nodePtr);
3425       }
3426     }
3427   }
3428 
3429   jam();
3430   checkBatchComplete(signal, requestPtr);
3431 }
3432 
3433 /**
3434  * Release as much as possible of sub objects owned by this Request,
3435  * including its TreeNodes.
3436  * The Request itself is *not* released yet as it may still be needed
3437  * to track the state of the request. (Set to include RS_DONE)
3438  */
3439 void
3440 Dbspj::cleanup(Ptr<Request> requestPtr)
3441 {
3442   ndbrequire(requestPtr.p->m_cnt_active == 0);
3443   {
3444     Ptr<TreeNode> nodePtr;
3445     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3446     while (list.removeFirst(nodePtr))
3447     {
3448       jam();
3449       ndbrequire(nodePtr.p->m_info != 0 && nodePtr.p->m_info->m_cleanup != 0);
3450       (this->*(nodePtr.p->m_info->m_cleanup))(requestPtr, nodePtr);
3451 
3452       m_treenode_pool.release(nodePtr);
3453     }
3454   }
3455   if (requestPtr.p->isScan())
3456   {
3457     jam();
3458 
3459     /**
3460      * If a Request in state RS_WAITING is aborted (node failure?),
3461      * there is no ongoing client request we can reply to.
3462      * We set it to RS_ABORTED state now, a later SCAN_NEXTREQ will
3463      * find the RS_ABORTED request, REF with the abort reason, and
3464      * then complete the cleaning up
3465      *
3466      * NOTE1: If no SCAN_NEXTREQ ever arrives for this Request, it
3467      *        is effectively leaked!
3468      *
3469      * NOTE2: During testing I was never able to find any SCAN_NEXTREQ
3470      *        arriving for an ABORTED query. So there likely are such
3471      *        leaks! Suspect that TC does not send SCAN_NEXTREQ to
3472      *        SPJ/LQH blocks affected by a node failure?
3473      */
3474     if (unlikely((requestPtr.p->m_state & Request::RS_WAITING) != 0))
3475     {
3476       jam();
3477       requestPtr.p->m_state = Request::RS_ABORTED;
3478       return;
3479     }
3480     m_scan_request_hash.remove(requestPtr, *requestPtr.p);
3481   }
3482   else
3483   {
3484     jam();
3485     m_lookup_request_hash.remove(requestPtr, *requestPtr.p);
3486   }
3487   releaseRequestBuffers(requestPtr);
3488   ArenaHead ah = requestPtr.p->m_arena;
3489   m_request_pool.release(requestPtr);
3490   m_arenaAllocator.release(ah);
3491 }
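/**
 * Editor's note - on the release order at the end of ::cleanup()
 * above: the ArenaHead is copied out *before* the Request is
 * released, presumably because the Request object itself lives in
 * that arena. A sketch of the idiom:
 *
 *   ArenaHead ah = requestPtr.p->m_arena;  // copy while owner is alive
 *   m_request_pool.release(requestPtr);    // Request memory now gone
 *   m_arenaAllocator.release(ah);          // finally free the arena
 */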
3492 
3493 void
3494 Dbspj::cleanup_common(Ptr<Request> requestPtr, Ptr<TreeNode> treeNodePtr)
3495 {
3496   jam();
3497 
3498   // Release TreeNode object allocated in the Request 'global' m_arena.
3499   // (Actually made obsolete by the entire Request::m_arena being released later)
3500   LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
3501   {
3502     Local_dependency_map list(pool, treeNodePtr.p->m_child_nodes);
3503     list.release();
3504   }
3505 
3506   {
3507     Local_pattern_store pattern(pool, treeNodePtr.p->m_keyPattern);
3508     pattern.release();
3509   }
3510 
3511   {
3512     Local_pattern_store pattern(pool, treeNodePtr.p->m_attrParamPattern);
3513     pattern.release();
3514   }
3515 
3516   // Correlation ids for deferred operations are allocated in the batch-specific
3517   // arena. It is sufficient to release the entire memory arena.
3518   m_arenaAllocator.release(treeNodePtr.p->m_batchArena);
3519 
3520   if (treeNodePtr.p->m_send.m_keyInfoPtrI != RNIL)
3521   {
3522     jam();
3523     releaseSection(treeNodePtr.p->m_send.m_keyInfoPtrI);
3524   }
3525 
3526   if (treeNodePtr.p->m_send.m_attrInfoPtrI != RNIL)
3527   {
3528     jam();
3529     releaseSection(treeNodePtr.p->m_send.m_attrInfoPtrI);
3530   }
3531 }
3532 
3533 static
3534 bool
3535 spjCheckFailFunc(const char* predicate,
3536                  const char* file,
3537                  const unsigned line,
3538                  const Uint32 instance)
3539 {
3540   g_eventLogger->info("DBSPJ %u : Failed spjCheck (%s) "
3541                       "at line %u of %s.",
3542                       instance,
3543                       predicate,
3544                       line,
3545                       file);
3546   return false;
3547 }
3548 
3549 #define spjCheck(check)                                      \
3550   ((check)?true:                                             \
3551    spjCheckFailFunc(#check, __FILE__, __LINE__, instance())) \
3552 
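/**
 * Editor's note - intended usage of spjCheck(), sketched from
 * ::checkRequest() below: evaluate a predicate, let spjCheckFailFunc()
 * log any failure, and fold the outcome into an overall result rather
 * than asserting immediately (the predicate shown is illustrative):
 *
 *   bool result = true;
 *   result &= spjCheck(requestPtr.p->m_outstanding == 0);
 *   if (!result)
 *   {
 *     dumpRequest("failed check", requestPtr);  // dump first...
 *     ndbabort();                               // ...then crash
 *   }
 */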
3553 
3554 bool
3555 Dbspj::checkRequest(const Ptr<Request> requestPtr)
3556 {
3557   jam();
3558 
3559   /**
3560    * We check the request, with each individual assertion
3561    * affecting the overall result code.
3562    * We attempt to dump the request if there is a problem.
3563    * Dumping is done last, to avoid problems with concurrently
3564    * iterating the same IntrusiveList.
3565    * So checks should record the problem type etc., but not
3566    * ndbabort() immediately.  See spjCheck() above.
3567    */
3568 
3569   bool result = true;
3570 
3571   {
3572     Ptr<TreeNode> treeNodePtr;
3573     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3574     for (list.first(treeNodePtr);
3575          !treeNodePtr.isNull();
3576          list.next(treeNodePtr))
3577     {
3578       jam();
3579       ndbrequire(treeNodePtr.p->m_info != NULL);
3580       if (treeNodePtr.p->m_info->m_checkNode != NULL)
3581       {
3582         jam();
3583         result &= (this->*(treeNodePtr.p->m_info->m_checkNode))
3584           (requestPtr, treeNodePtr);
3585       }
3586     }
3587   }
3588 
3589   if (!result)
3590   {
3591     dumpRequest("failed checkRequest()",
3592                 requestPtr);
3593     ndbabort();
3594   }
3595 
3596   return result;
3597 }
3598 
3599 /**
3600  * Processing of signals from LQH
3601  */
3602 void
3603 Dbspj::execLQHKEYREF(Signal* signal)
3604 {
3605   jamEntry();
3606 
3607   const LqhKeyRef* ref = reinterpret_cast<const LqhKeyRef*>(signal->getDataPtr());
3608 
3609   Ptr<TreeNode> treeNodePtr;
3610   m_treenode_pool.getPtr(treeNodePtr, ref->connectPtr);
3611 
3612   Ptr<Request> requestPtr;
3613   m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
3614   ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));
3615 
3616   ndbassert(checkRequest(requestPtr));
3617 
3618   DEBUG("execLQHKEYREF"
3619      << ", node: " << treeNodePtr.p->m_node_no
3620      << ", request: " << requestPtr.i
3621      << ", errorCode: " << ref->errorCode
3622   );
3623 
3624   ndbrequire(treeNodePtr.p->m_info && treeNodePtr.p->m_info->m_execLQHKEYREF);
3625   (this->*(treeNodePtr.p->m_info->m_execLQHKEYREF))(signal,
3626                                                     requestPtr,
3627                                                     treeNodePtr);
3628   jam();
3629   checkBatchComplete(signal, requestPtr);
3630 }
3631 
3632 void
3633 Dbspj::execLQHKEYCONF(Signal* signal)
3634 {
3635   jamEntry();
3636 
3637   const LqhKeyConf* conf = reinterpret_cast<const LqhKeyConf*>(signal->getDataPtr());
3638   Ptr<TreeNode> treeNodePtr;
3639   m_treenode_pool.getPtr(treeNodePtr, conf->opPtr);
3640 
3641   Ptr<Request> requestPtr;
3642   m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
3643   ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));
3644 
3645   DEBUG("execLQHKEYCONF"
3646      << ", node: " << treeNodePtr.p->m_node_no
3647      << ", request: " << requestPtr.i
3648   );
3649 
3650   ndbrequire(treeNodePtr.p->m_info && treeNodePtr.p->m_info->m_execLQHKEYCONF);
3651   (this->*(treeNodePtr.p->m_info->m_execLQHKEYCONF))(signal,
3652                                                      requestPtr,
3653                                                      treeNodePtr);
3654   jam();
3655   checkBatchComplete(signal, requestPtr);
3656 }
3657 
3658 void
3659 Dbspj::execSCAN_FRAGREF(Signal* signal)
3660 {
3661   jamEntry();
3662   const ScanFragRef* ref = reinterpret_cast<const ScanFragRef*>(signal->getDataPtr());
3663 
3664   Ptr<ScanFragHandle> scanFragHandlePtr;
3665   m_scanfraghandle_pool.getPtr(scanFragHandlePtr, ref->senderData);
3666   Ptr<TreeNode> treeNodePtr;
3667   m_treenode_pool.getPtr(treeNodePtr, scanFragHandlePtr.p->m_treeNodePtrI);
3668   Ptr<Request> requestPtr;
3669   m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
3670   ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));
3671 
3672   ndbassert(checkRequest(requestPtr));
3673 
3674   DEBUG("execSCAN_FRAGREF"
3675      << ", node: " << treeNodePtr.p->m_node_no
3676      << ", request: " << requestPtr.i
3677      << ", errorCode: " << ref->errorCode
3678   );
3679 
3680   ndbrequire(treeNodePtr.p->m_info&&treeNodePtr.p->m_info->m_execSCAN_FRAGREF);
3681   (this->*(treeNodePtr.p->m_info->m_execSCAN_FRAGREF))(signal,
3682                                                        requestPtr,
3683                                                        treeNodePtr,
3684                                                        scanFragHandlePtr);
3685   jam();
3686   checkBatchComplete(signal, requestPtr);
3687 }
3688 
3689 void
3690 Dbspj::execSCAN_HBREP(Signal* signal)
3691 {
3692   jamEntry();
3693 
3694   Uint32 senderData = signal->theData[0];
3695   //Uint32 transId[2] = { signal->theData[1], signal->theData[2] };
3696 
3697   Ptr<ScanFragHandle> scanFragHandlePtr;
3698   m_scanfraghandle_pool.getPtr(scanFragHandlePtr, senderData);
3699   Ptr<TreeNode> treeNodePtr;
3700   m_treenode_pool.getPtr(treeNodePtr, scanFragHandlePtr.p->m_treeNodePtrI);
3701   Ptr<Request> requestPtr;
3702   m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
3703   DEBUG("execSCAN_HBREP"
3704      << ", node: " << treeNodePtr.p->m_node_no
3705      << ", request: " << requestPtr.i
3706   );
3707 
3708   Uint32 ref = requestPtr.p->m_senderRef;
3709   signal->theData[0] = requestPtr.p->m_senderData;
3710   sendSignal(ref, GSN_SCAN_HBREP, signal, 3, JBB);
3711 }
3712 
3713 void
3714 Dbspj::execSCAN_FRAGCONF(Signal* signal)
3715 {
3716   jamEntry();
3717 
3718   const ScanFragConf* conf = reinterpret_cast<const ScanFragConf*>(signal->getDataPtr());
3719 
3720 #ifdef DEBUG_SCAN_FRAGREQ
3721   ndbout_c("Dbspj::execSCAN_FRAGCONF() receiving SCAN_FRAGCONF ");
3722   printSCAN_FRAGCONF(stdout, signal->getDataPtrSend(),
3723                      conf->total_len,
3724                      DBLQH);
3725 #endif
3726 
3727   Ptr<ScanFragHandle> scanFragHandlePtr;
3728   m_scanfraghandle_pool.getPtr(scanFragHandlePtr, conf->senderData);
3729   Ptr<TreeNode> treeNodePtr;
3730   m_treenode_pool.getPtr(treeNodePtr, scanFragHandlePtr.p->m_treeNodePtrI);
3731   Ptr<Request> requestPtr;
3732   m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
3733 
3734   ndbassert(checkRequest(requestPtr));
3735 
3736   ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no) ||
3737             requestPtr.p->m_state & Request::RS_ABORTING);
3738 
3739   DEBUG("execSCAN_FRAGCONF"
3740      << ", node: " << treeNodePtr.p->m_node_no
3741      << ", request: " << requestPtr.i
3742   );
3743 
3744   ndbrequire(treeNodePtr.p->m_info&&treeNodePtr.p->m_info->m_execSCAN_FRAGCONF);
3745   (this->*(treeNodePtr.p->m_info->m_execSCAN_FRAGCONF))(signal,
3746                                                         requestPtr,
3747                                                         treeNodePtr,
3748                                                         scanFragHandlePtr);
3749   jam();
3750   checkBatchComplete(signal, requestPtr);
3751 }
3752 
3753 void
3754 Dbspj::execSCAN_NEXTREQ(Signal* signal)
3755 {
3756   jamEntry();
3757   const ScanFragNextReq * req = (ScanFragNextReq*)&signal->theData[0];
3758 
3759 #ifdef DEBUG_SCAN_FRAGREQ
3760   DEBUG("Incomming SCAN_NEXTREQ");
3761   printSCANFRAGNEXTREQ(stdout, &signal->theData[0],
3762                        ScanFragNextReq::SignalLength, DBLQH);
3763 #endif
3764 
3765   Request key;
3766   key.m_transId[0] = req->transId1;
3767   key.m_transId[1] = req->transId2;
3768   key.m_senderData = req->senderData;
3769 
3770   Ptr<Request> requestPtr;
3771   if (unlikely(!m_scan_request_hash.find(requestPtr, key)))
3772   {
3773     jam();
3774     ndbrequire(ScanFragNextReq::getCloseFlag(req->requestInfo));
3775     return;
3776   }
3777   DEBUG("execSCAN_NEXTREQ, request: " << requestPtr.i);
3778 
3779 #ifdef SPJ_TRACE_TIME
3780   const NDB_TICKS now = NdbTick_getCurrentTicks();
3781   const NDB_TICKS then = requestPtr.p->m_save_time;
3782   const Uint64 diff = NdbTick_Elapsed(then,now).microSec();
3783   requestPtr.p->m_sum_waiting += Uint32(diff);
3784   requestPtr.p->m_save_time = now;
3785 #endif
3786 
3787   ndbassert(checkRequest(requestPtr));
3788 
3789   Uint32 state = requestPtr.p->m_state;
3790   requestPtr.p->m_state = state & ~Uint32(Request::RS_WAITING);
3791 
3792   do //Not a loop, allows 'break' to common exit/error handling.
3793   {
3794     /**
3795      * A RS_ABORTED query is a 'tombstone' left behind when a
3796      * RS_WAITING query was aborted by node failures. The idea is
3797      * that the next SCAN_NEXTREQ will reply with the abort reason
3798      * and clean up.
3799      *
3800      * TODO: This doesn't seem to happen as assumed by design.
3801      *       Thus, RS_ABORTED queries are likely leaked!
3802      */
3803     if (unlikely(state == Request::RS_ABORTED))
3804     {
3805       jam();
3806       break;
3807     }
3808     if (unlikely((state & Request::RS_ABORTING) != 0))
3809     {
3810       /**
3811        * abort is already in progress...
3812        *   since RS_WAITING is cleared...it will end this request
3813        */
3814       jam();
3815       break;
3816     }
3817     if (ScanFragNextReq::getCloseFlag(req->requestInfo)) // Requested close scan
3818     {
3819       jam();
3820       abort(signal, requestPtr, 0); //Stop query, no error
3821       break;
3822     }
3823 
3824     ndbrequire((state & Request::RS_WAITING) != 0);
3825     ndbrequire(requestPtr.p->m_outstanding == 0);
3826 
3827     /**
3828      * Scroll all relevant cursors...
3829      */
3830     Ptr<TreeNode> treeNodePtr;
3831     Local_TreeNodeCursor_list list(m_treenode_pool,
3832                                    requestPtr.p->m_cursor_nodes);
3833     Uint32 cnt_active = 0;
3834 
3835     for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
3836     {
3837       if (treeNodePtr.p->m_state == TreeNode::TN_ACTIVE)
3838       {
3839         jam();
3840         DEBUG("SCAN_NEXTREQ on TreeNode: "
3841            << ", m_node_no: " << treeNodePtr.p->m_node_no
3842            << ", w/ m_parentPtrI: " << treeNodePtr.p->m_parentPtrI);
3843 
3844         ndbrequire(treeNodePtr.p->m_info != 0 &&
3845                    treeNodePtr.p->m_info->m_execSCAN_NEXTREQ != 0);
3846         (this->*(treeNodePtr.p->m_info->m_execSCAN_NEXTREQ))(signal,
3847                                                              requestPtr,
3848                                                              treeNodePtr);
3849         cnt_active++;
3850       }
3851       else
3852       {
3853         /**
3854          * Restart any other scans not being 'TN_ACTIVE'
3855          * (Only effective if 'RT_REPEAT_SCAN_RESULT')
3856          */
3857         jam();
3858         ndbrequire(requestPtr.p->m_bits & Request::RT_REPEAT_SCAN_RESULT);
3859         DEBUG("Restart TreeNode "
3860            << ", m_node_no: " << treeNodePtr.p->m_node_no
3861            << ", w/ m_parentPtrI: " << treeNodePtr.p->m_parentPtrI);
3862 
3863         ndbrequire(treeNodePtr.p->m_info != 0 &&
3864                    treeNodePtr.p->m_info->m_parent_batch_complete !=0 );
3865         (this->*(treeNodePtr.p->m_info->m_parent_batch_complete))(signal,
3866                                                                   requestPtr,
3867                                                                   treeNodePtr);
3868       }
3869       if (unlikely((requestPtr.p->m_state & Request::RS_ABORTING) != 0))
3870       {
3871         jam();
3872         break;
3873       }
3874     }// for all treeNodes in 'm_cursor_nodes'
3875 
3876     /* Expected only a single ACTIVE TreeNode among the cursors */
3877     ndbrequire(cnt_active == 1 ||
3878                !(requestPtr.p->m_bits & Request::RT_REPEAT_SCAN_RESULT));
3879   }
3880   while (0);
3881 
3882   // If nothing restarted, or failed, we have to handle completion
3883   jam();
3884   checkBatchComplete(signal, requestPtr);
3885 }
3886 
3887 void
3888 Dbspj::execTRANSID_AI(Signal* signal)
3889 {
3890   jamEntry();
3891   TransIdAI * req = (TransIdAI *)signal->getDataPtr();
3892   Uint32 ptrI = req->connectPtr;
3893 
3894   Ptr<TreeNode> treeNodePtr;
3895   m_treenode_pool.getPtr(treeNodePtr, ptrI);
3896   Ptr<Request> requestPtr;
3897   m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
3898 
3899   ndbassert(checkRequest(requestPtr));
3900   ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));
3901   ndbassert(treeNodePtr.p->m_bits & TreeNode::T_EXPECT_TRANSID_AI);
3902 
3903   DEBUG("execTRANSID_AI"
3904      << ", node: " << treeNodePtr.p->m_node_no
3905      << ", request: " << requestPtr.i
3906   );
3907 
3908   ndbrequire(signal->getNoOfSections() != 0);
3909 
3910   SegmentedSectionPtr dataPtr;
3911   {
3912     SectionHandle handle(this, signal);
3913     handle.getSection(dataPtr, 0);
3914     handle.clear();
3915   }
3916 
3917 #if defined(DEBUG_LQHKEYREQ) || defined(DEBUG_SCAN_FRAGREQ)
3918   printf("execTRANSID_AI: ");
3919   print(dataPtr, stdout);
3920 #endif
3921 
3922   /**
3923    * Register signal as arrived.
3924    */
3925   ndbassert(treeNodePtr.p->m_info&&treeNodePtr.p->m_info->m_countSignal);
3926   (this->*(treeNodePtr.p->m_info->m_countSignal))(signal,
3927                                                   requestPtr,
3928                                                   treeNodePtr, 1);
3929 
3930   /**
3931    * build easy-access-array for row
3932    */
3933   Uint32 tmp[2+MAX_ATTRIBUTES_IN_TABLE];
3934   RowPtr::Header* header = CAST_PTR(RowPtr::Header, &tmp[0]);
3935 
3936   Uint32 cnt = buildRowHeader(header, dataPtr);
3937   ndbassert(header->m_len < NDB_ARRAY_SIZE(tmp));
3938 
3939   struct RowPtr row;
3940   row.m_type = RowPtr::RT_SECTION;
3941   row.m_matched = NULL;
3942   row.m_src_node_ptrI = treeNodePtr.i;
3943   row.m_row_data.m_section.m_header = header;
3944   row.m_row_data.m_section.m_dataPtr.assign(dataPtr);
3945 
3946   getCorrelationData(row.m_row_data.m_section,
3947                      cnt - 1,
3948                      row.m_src_correlation);
3949 
3950   do  //Dummy loop to allow 'break' into error handling
3951   {
3952     if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY)
3953     {
3954       jam();
3955       Uint32 err;
3956 
3957       DEBUG("Need to storeRow"
3958         << ", node: " << treeNodePtr.p->m_node_no
3959       );
3960 
3961       if (ERROR_INSERTED(17120) ||
3962          (ERROR_INSERTED(17121) && treeNodePtr.p->m_parentPtrI != RNIL) ||
3963          (ERROR_INSERTED(17122) && refToNode(signal->getSendersBlockRef()) != getOwnNodeId()))
3964       {
3965         jam();
3966         CLEAR_ERROR_INSERT_VALUE;
3967         abort(signal, requestPtr, DbspjErr::OutOfRowMemory);
3968         break;
3969       }
3970       else if ((err = storeRow(treeNodePtr, row)) != 0)
3971       {
3972         jam();
3973         abort(signal, requestPtr, err);
3974         break;
3975       }
3976     }
3977     common_execTRANSID_AI(signal, requestPtr, treeNodePtr, row);
3978   }
3979   while(0);
3980 
3981   release(dataPtr);
3982 
3983   /**
3984    * When TreeNode is completed we might have to reply, or
3985    * resume other parts of the request.
3986    */
3987   if (requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no))
3988   {
3989     jam();
3990     handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
3991   }
3992 
3993   jam();
3994   checkBatchComplete(signal, requestPtr);
3995 }
3996 
3997 Uint32
3998 Dbspj::storeRow(Ptr<TreeNode> treeNodePtr, const RowPtr &row)
3999 {
4000   ndbassert(row.m_type == RowPtr::RT_SECTION);
4001   RowCollection& collection = treeNodePtr.p->m_rows;
4002   SegmentedSectionPtr dataPtr = row.m_row_data.m_section.m_dataPtr;
4003   Uint32 datalen;
4004   Uint32 *headptr;
4005   Uint32 headlen;
4006 
4007   Uint32 tmpHeader[2];
4008   if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ROW)
4009   {
4010     headptr = (Uint32*)row.m_row_data.m_section.m_header;
4011     headlen = 1 + row.m_row_data.m_section.m_header->m_len;
4012     datalen = dataPtr.sz;
4013   }
4014   else
4015   {
4016     // Build a header for only the 1-word correlation
4017     RowPtr::Header *header = CAST_PTR(RowPtr::Header, &tmpHeader[0]);
4018     header->m_len = 1;
4019     header->m_offset[0] = 0;
4020     headptr = (Uint32*)header;
4021     headlen = 1 + header->m_len;
4022 
4023     // 2 words: AttributeHeader + CorrelationId
4024     datalen = 2;
4025   }
4026 
4027   /**
4028    * Rows might be stored at an offset within the collection.
4029    * Calculate size to allocate for buffer.
4030    */
4031   const Uint32 offset = collection.rowOffset();
4032   const Uint32 matchlen =
4033      (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH) ? 1 : 0;
4034   const Uint32 totlen = offset + matchlen + headlen + datalen;
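  /**
   * Editor's note - the buffered row layout implied by the length
   * calculation above, in words (illustrative; actual offsets depend
   * on the collection type and the T_BUFFER_* bits):
   *
   *   [0 .. offset-1]  link area / map storage (collection.rowOffset())
   *   [offset]         'm_matched' TreeNodeBitMask, iff T_BUFFER_MATCH
   *   [.. + headlen]   RowPtr::Header: m_len plus column offsets
   *   [.. + datalen]   full row if T_BUFFER_ROW, else the 2-word
   *                    AttributeHeader + correlationId
   */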
4035 
4036   RowRef ref;
4037   Uint32* dstptr = rowAlloc(*collection.m_base.m_rowBuffer, ref, totlen);
4038   if (unlikely(dstptr == NULL))
4039   {
4040     jam();
4041     return DbspjErr::OutOfRowMemory;
4042   }
4043   Uint32 * const saved_dstptr = dstptr;
4044   dstptr += offset;
4045 
4046   // Insert 'MATCH', Header and 'ROW'/correlationId as specified
4047   if (matchlen > 0)
4048   {
4049     TreeNodeBitMask matched(treeNodePtr.p->m_dependencies);
4050     matched.set(treeNodePtr.p->m_node_no);
4051     memcpy(dstptr, &matched, 4 * matchlen);
4052     dstptr += matchlen;
4053   }
4054 
4055   memcpy(dstptr, headptr, 4 * headlen);
4056   dstptr += headlen;
4057 
4058   if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ROW)
4059   {
4060     //Store entire row, include correlationId (last column)
4061     copy(dstptr, dataPtr);
4062   }
4063   else
4064   {
4065     //Store only the correlation-id if not 'BUFFER_ROW':
4066     const RowPtr::Header *header = row.m_row_data.m_section.m_header;
4067     const Uint32 pos = header->m_offset[header->m_len-1];
4068     SectionReader reader(dataPtr, getSectionSegmentPool());
4069     ndbrequire(reader.step(pos));
4070     ndbrequire(reader.getWords(dstptr, 2));
4071   }
4072 
4073   /**
4074    * Register row in a list or a correlationId searchable 'map'
4075    * Note that add_to_* may relocate the entire memory area which
4076    * 'dstptr' referred to, so it is not safe to use 'dstptr' *after*
4077    * the add_to_* below.
4078    */
4079   if (collection.m_type == RowCollection::COLLECTION_LIST)
4080   {
4081     NullRowRef.copyto_link(saved_dstptr); // Null terminate list...
4082     add_to_list(collection.m_list, ref);
4083   }
4084   else
4085   {
4086     Uint32 error = add_to_map(collection.m_map, row.m_src_correlation, ref);
4087     if (unlikely(error))
4088       return error;
4089   }
4090 
4091   return 0;
4092 }
4093 
4094 void
4095 Dbspj::setupRowPtr(Ptr<TreeNode> treeNodePtr,
4096                    RowPtr& row, RowRef ref, const Uint32 * src)
4097 {
4098   ndbassert(src != NULL);
4099   const Uint32 offset = treeNodePtr.p->m_rows.rowOffset();
4100   const Uint32 matchlen =
4101      (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH) ? 1 : 0;
4102   const RowPtr::Header * headptr = (RowPtr::Header*)(src + offset + matchlen);
4103   const Uint32 headlen = 1 + headptr->m_len;
4104 
4105   // Setup row, containing either entire row or only the correlationId.
4106   row.m_type = RowPtr::RT_LINEAR;
4107   row.m_row_data.m_linear.m_row_ref = ref;
4108   row.m_row_data.m_linear.m_header = headptr;
4109   row.m_row_data.m_linear.m_data = (Uint32*)headptr + headlen;
4110 
4111   if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH)
4112   {
4113     row.m_matched = (TreeNodeBitMask*)(src + offset);
4114   }
4115   else
4116   {
4117     row.m_matched = NULL;
4118   }
4119 }
4120 
4121 void
4122 Dbspj::add_to_list(SLFifoRowList & list, RowRef rowref)
4123 {
4124   if (list.isNull())
4125   {
4126     jam();
4127     list.m_first_row_page_id = rowref.m_page_id;
4128     list.m_first_row_page_pos = rowref.m_page_pos;
4129   }
4130   else
4131   {
4132     jam();
4133     /**
4134      * add last to list
4135      */
4136     RowRef last;
4137     last.m_alloc_type = rowref.m_alloc_type;
4138     last.m_page_id = list.m_last_row_page_id;
4139     last.m_page_pos = list.m_last_row_page_pos;
4140     Uint32 * const rowptr = get_row_ptr(last);
4141     rowref.copyto_link(rowptr);
4142   }
4143 
4144   list.m_last_row_page_id = rowref.m_page_id;
4145   list.m_last_row_page_pos = rowref.m_page_pos;
4146 }
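/**
 * Editor's note - a sketch of how the singly-linked fifo built by
 * ::add_to_list() is traversed; it simply mirrors ::first()/::next()
 * below (written as if inside a Dbspj member function, with 'list'
 * belonging to an initialized RowBuffer):
 *
 *   SLFifoRowListIterator iter;
 *   for (bool more = first(list, iter); more; more = next(iter))
 *   {
 *     Uint32* rowptr = iter.m_row_ptr;  // rows linked via copyto_link()
 *   }
 */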
4147 
4148 Uint32 *
4149 Dbspj::get_row_ptr(RowRef pos)
4150 {
4151   Ptr<RowPage> ptr;
4152   m_page_pool.getPtr(ptr, pos.m_page_id);
4153   if (pos.m_alloc_type == BUFFER_STACK) // ::stackAlloc() memory
4154   {
4155     jam();
4156     return ptr.p->m_data + pos.m_page_pos;
4157   }
4158   else                                 // ::varAlloc() memory
4159   {
4160     jam();
4161     ndbassert(pos.m_alloc_type == BUFFER_VAR);
4162     return ((Var_page*)ptr.p)->get_ptr(pos.m_page_pos);
4163   }
4164 }
4165 
4166 inline
4167 bool
4168 Dbspj::first(const SLFifoRowList& list,
4169              SLFifoRowListIterator& iter)
4170 {
4171   if (list.isNull())
4172   {
4173     jam();
4174     iter.setNull();
4175     return false;
4176   }
4177 
4179   iter.m_ref.m_alloc_type = list.m_rowBuffer->m_type;
4180   iter.m_ref.m_page_id = list.m_first_row_page_id;
4181   iter.m_ref.m_page_pos = list.m_first_row_page_pos;
4182   iter.m_row_ptr = get_row_ptr(iter.m_ref);
4183   return true;
4184 }
4185 
4186 inline
4187 bool
4188 Dbspj::next(SLFifoRowListIterator& iter)
4189 {
4190   iter.m_ref.assign_from_link(iter.m_row_ptr);
4191   if (iter.m_ref.isNull())
4192   {
4193     jam();
4194     return false;
4195   }
4196   iter.m_row_ptr = get_row_ptr(iter.m_ref);
4197   return true;
4198 }
4199 
4200 Uint32
4201 Dbspj::add_to_map(RowMap& map,
4202                   Uint32 corrVal, RowRef rowref)
4203 {
4204   Uint32 * mapptr;
4205   if (unlikely(map.isNull()))
4206   {
4207     jam();
4208     ndbassert(map.m_size > 0);
4209     ndbassert(map.m_rowBuffer != NULL);
4210 
4211     Uint32 sz16 = RowMap::MAP_SIZE_PER_REF_16 * map.m_size;
4212     Uint32 sz32 = (sz16 + 1) / 2;
4213     RowRef ref;
4214     mapptr = rowAlloc(*map.m_rowBuffer, ref, sz32);
4215     if (unlikely(mapptr == 0))
4216     {
4217       jam();
4218       return DbspjErr::OutOfRowMemory;
4219     }
4220     map.assign(ref);
4221     map.m_elements = 0;
4222     map.clear(mapptr);
4223   }
4224   else
4225   {
4226     jam();
4227     RowRef ref;
4228     map.copyto(ref);
4229     mapptr = get_row_ptr(ref);
4230   }
4231 
4232   Uint32 pos = corrVal & 0xFFFF;
4233   ndbrequire(pos < map.m_size);
4234   ndbrequire(map.m_elements < map.m_size);
4235 
4236   if (1)
4237   {
4238     /**
4239      * Check that *pos* is empty
4240      */
4241     RowRef check;
4242     map.load(mapptr, pos, check);
4243     ndbrequire(check.m_page_pos == 0xFFFF);
4244   }
4245 
4246   map.store(mapptr, pos, rowref);
4247 
4248   return 0;
4249 }
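/**
 * Editor's note - the RowMap populated above is a flat array of
 * RowRefs indexed by the low 16 bits of the correlation value, so a
 * lookup is a sketch like the following (mirroring ::getBufferedRow()
 * below; 'corrVal' is the correlation value to resolve):
 *
 *   RowRef ref;
 *   map.copyto(ref);                          // ref -> the map itself
 *   Uint32* mapptr = get_row_ptr(ref);
 *   map.load(mapptr, corrVal & 0xFFFF, ref);  // ref -> the stored row
 *   Uint32* rowptr = get_row_ptr(ref);
 */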
4250 
4251 inline
4252 bool
4253 Dbspj::first(const RowMap& map,
4254              RowMapIterator & iter)
4255 {
4256   if (map.isNull())
4257   {
4258     jam();
4259     iter.setNull();
4260     return false;
4261   }
4262 
4263   iter.m_map_ptr = get_row_ptr(map.m_map_ref);
4264   iter.m_size = map.m_size;
4265   iter.m_ref.m_alloc_type = map.m_rowBuffer->m_type;
4266 
4267   Uint32 pos = 0;
4268   while (pos < iter.m_size && RowMap::isNull(iter.m_map_ptr, pos))
4269     pos++;
4270 
4271   if (pos == iter.m_size)
4272   {
4273     jam();
4274     iter.setNull();
4275     return false;
4276   }
4277   else
4278   {
4279     jam();
4280     RowMap::load(iter.m_map_ptr, pos, iter.m_ref);
4281     iter.m_element_no = pos;
4282     iter.m_row_ptr = get_row_ptr(iter.m_ref);
4283     return true;
4284   }
4285 }
4286 
4287 inline
4288 bool
4289 Dbspj::next(RowMapIterator & iter)
4290 {
4291   Uint32 pos = iter.m_element_no + 1;
4292   while (pos < iter.m_size && RowMap::isNull(iter.m_map_ptr, pos))
4293     pos++;
4294 
4295   if (pos == iter.m_size)
4296   {
4297     jam();
4298     iter.setNull();
4299     return false;
4300   }
4301   else
4302   {
4303     jam();
4304     RowMap::load(iter.m_map_ptr, pos, iter.m_ref);
4305     iter.m_element_no = pos;
4306     iter.m_row_ptr = get_row_ptr(iter.m_ref);
4307     return true;
4308   }
4309 }
4310 
4311 bool
4312 Dbspj::first(const RowCollection& collection,
4313              RowIterator& iter)
4314 {
4315   iter.m_type = collection.m_type;
4316   if (iter.m_type == RowCollection::COLLECTION_LIST)
4317   {
4318     jam();
4319     return first(collection.m_list, iter.m_list);
4320   }
4321   else
4322   {
4323     jam();
4324     ndbassert(iter.m_type == RowCollection::COLLECTION_MAP);
4325     return first(collection.m_map, iter.m_map);
4326   }
4327 }
4328 
4329 bool
4330 Dbspj::next(RowIterator& iter)
4331 {
4332   if (iter.m_type == RowCollection::COLLECTION_LIST)
4333   {
4334     jam();
4335     return next(iter.m_list);
4336   }
4337   else
4338   {
4339     jam();
4340     ndbassert(iter.m_type == RowCollection::COLLECTION_MAP);
4341     return next(iter.m_map);
4342   }
4343 }
4344 
4345 inline
4346 Uint32 *
4347 Dbspj::stackAlloc(RowBuffer & buffer, RowRef& dst, Uint32 sz)
4348 {
4349   Ptr<RowPage> ptr;
4350   Local_RowPage_fifo list(m_page_pool, buffer.m_page_list);
4351 
4352   Uint32 pos = buffer.m_stack.m_pos;
4353   const Uint32 SIZE = RowPage::SIZE;
4354   if (list.isEmpty() || (pos + sz) > SIZE)
4355   {
4356     jam();
4357     bool ret = allocPage(ptr);
4358     if (unlikely(ret == false))
4359     {
4360       jam();
4361       return 0;
4362     }
4363 
4364     pos = 0;
4365     list.addLast(ptr);
4366   }
4367   else
4368   {
4369     list.last(ptr);
4370   }
4371 
4372   dst.m_page_id = ptr.i;
4373   dst.m_page_pos = pos;
4374   dst.m_alloc_type = BUFFER_STACK;
4375   buffer.m_stack.m_pos = pos + sz;
4376   return ptr.p->m_data + pos;
4377 }
4378 
4379 inline
4380 Uint32 *
4381 Dbspj::varAlloc(RowBuffer & buffer, RowRef& dst, Uint32 sz)
4382 {
4383   Ptr<RowPage> ptr;
4384   Local_RowPage_fifo list(m_page_pool, buffer.m_page_list);
4385 
4386   Uint32 free_space = buffer.m_var.m_free;
4387   if (list.isEmpty() || free_space < (sz + 1))
4388   {
4389     jam();
4390     bool ret = allocPage(ptr);
4391     if (unlikely(ret == false))
4392     {
4393       jam();
4394       return 0;
4395     }
4396 
4397     list.addLast(ptr);
4398     ((Var_page*)ptr.p)->init();
4399   }
4400   else
4401   {
4402     jam();
4403     list.last(ptr);
4404   }
4405 
4406   Var_page * vp = (Var_page*)ptr.p;
4407   Uint32 pos = vp->alloc_record(sz, (Var_page*)m_buffer0, Var_page::CHAIN);
4408 
4409   dst.m_page_id = ptr.i;
4410   dst.m_page_pos = pos;
4411   dst.m_alloc_type = BUFFER_VAR;
4412   buffer.m_var.m_free = vp->free_space;
4413   return vp->get_ptr(pos);
4414 }
4415 
4416 Uint32 *
4417 Dbspj::rowAlloc(RowBuffer& rowBuffer, RowRef& dst, Uint32 sz)
4418 {
4419   if (rowBuffer.m_type == BUFFER_STACK)
4420   {
4421     jam();
4422     return stackAlloc(rowBuffer, dst, sz);
4423   }
4424   else if (rowBuffer.m_type == BUFFER_VAR)
4425   {
4426     jam();
4427     return varAlloc(rowBuffer, dst, sz);
4428   }
4429   else
4430   {
4431     jam();
4432     ndbabort();
4433     return NULL;
4434   }
4435 }
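/**
 * Editor's note - a sketch contrasting the two allocators dispatched
 * to above: ::stackAlloc() bump-allocates from a word position in the
 * last page and can only reclaim memory page-wise, while ::varAlloc()
 * uses Var_page records which can be freed individually (see
 * ::releaseRow()). Either way the caller gets a RowRef back:
 *
 *   RowRef ref;
 *   Uint32* dst = rowAlloc(rowBuffer, ref, totlen);  // names as in
 *                                                    // ::storeRow()
 *   // BUFFER_STACK: ref.m_page_pos is a word offset into the page
 *   // BUFFER_VAR:   ref.m_page_pos is a Var_page record index
 */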
4436 
4437 bool
4438 Dbspj::allocPage(Ptr<RowPage> & ptr)
4439 {
4440   if (m_free_page_list.isEmpty())
4441   {
4442     jam();
4443     if (ERROR_INSERTED_CLEAR(17003))
4444     {
4445       jam();
4446       ndbout_c("Injecting failed '::allocPage', error 17003 at line %d file %s",
4447                __LINE__,  __FILE__);
4448       return false;
4449     }
4450     ptr.p = (RowPage*)m_ctx.m_mm.alloc_page(RT_SPJ_DATABUFFER,
4451                                             &ptr.i,
4452                                             Ndbd_mem_manager::NDB_ZONE_LE_32);
4453     if (ptr.p == 0)
4454     {
4455       jam();
4456       return false;
4457     }
4458     return true;
4459   }
4460   else
4461   {
4462     jam();
4463     Local_RowPage_list list(m_page_pool, m_free_page_list);
4464     bool ret = list.removeFirst(ptr);
4465     ndbrequire(ret);
4466     return ret;
4467   }
4468 }
4469 
4470 void
4471 Dbspj::releasePage(Ptr<RowPage> ptr)
4472 {
4473   Local_RowPage_list list(m_page_pool, m_free_page_list);
4474   list.addFirst(ptr);
4475 }
4476 
4477 void
4478 Dbspj::releaseGlobal(Signal * signal)
4479 {
4480   Uint32 delay = 100;
4481   Local_RowPage_list list(m_page_pool, m_free_page_list);
4482   if (list.isEmpty())
4483   {
4484     jam();
4485     delay = 300;
4486   }
4487   else
4488   {
4489     Ptr<RowPage> ptr;
4490     list.removeFirst(ptr);
4491     m_ctx.m_mm.release_page(RT_SPJ_DATABUFFER, ptr.i);
4492   }
4493 
4494   signal->theData[0] = 0;
4495   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, delay, 1);
4496 }
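/**
 * Editor's note - ::releaseGlobal() above is a background trickle
 * release: at most one page per CONTINUEB round is handed back to the
 * global memory manager, rescheduled every 100ms while the local
 * free-list is non-empty and every 300ms otherwise. A free-list of N
 * pages thus drains in roughly N * 100ms, i.e. ~10 pages per second.
 */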
4497 
4498 Uint32
4499 Dbspj::checkTableError(Ptr<TreeNode> treeNodePtr) const
4500 {
4501   jam();
4502   if (treeNodePtr.p->m_tableOrIndexId >= c_tabrecFilesize)
4503   {
4504     jam();
4505     ndbassert(c_tabrecFilesize > 0);
4506     return DbspjErr::NoSuchTable;
4507   }
4508 
4509   TableRecordPtr tablePtr;
4510   tablePtr.i = treeNodePtr.p->m_tableOrIndexId;
4511   ptrAss(tablePtr, m_tableRecord);
4512   Uint32 err = tablePtr.p->checkTableError(treeNodePtr.p->m_schemaVersion);
4513   if (unlikely(err))
4514   {
4515     DEBUG_DICT("Dbsp::checkTableError"
4516               << ", m_node_no: " << treeNodePtr.p->m_node_no
4517               << ", tableOrIndexId: " << treeNodePtr.p->m_tableOrIndexId
4518               << ", error: " << err);
4519   }
4520   if (ERROR_INSERTED(17520) ||
4521       (ERROR_INSERTED(17521) && (rand() % 7) == 0))
4522   {
4523     jam();
4524     CLEAR_ERROR_INSERT_VALUE;
4525     ndbout_c("::checkTableError, injecting NoSuchTable error at line %d file %s",
4526               __LINE__,  __FILE__);
4527     return DbspjErr::NoSuchTable;
4528   }
4529   return err;
4530 }
4531 
4532 void
4533 Dbspj::dumpScanFragHandle(Ptr<ScanFragHandle> fragPtr) const
4534 {
4535   jam();
4536 
4537   g_eventLogger->info("DBSPJ %u :         SFH fragid %u state %u ref 0x%x "
4538                       "rangePtr 0x%x",
4539                       instance(),
4540                       fragPtr.p->m_fragId,
4541                       fragPtr.p->m_state,
4542                       fragPtr.p->m_ref,
4543                       fragPtr.p->m_rangePtrI);
4544 }
4545 
4546 
4547 void
4548 Dbspj::dumpNodeCommon(const Ptr<TreeNode> treeNodePtr) const
4549 {
4550   jam();
4551 
4552   g_eventLogger->info("DBSPJ %u :     TreeNode (%u) (0x%x:%p) state %u bits 0x%x "
4553                       "tableid %u schVer 0x%x",
4554                       instance(),
4555                       treeNodePtr.p->m_node_no,
4556                       treeNodePtr.i,
4557                       treeNodePtr.p,
4558                       treeNodePtr.p->m_state,
4559                       treeNodePtr.p->m_bits,
4560                       treeNodePtr.p->m_tableOrIndexId,
4561                       treeNodePtr.p->m_schemaVersion);
4562   g_eventLogger->info("DBSPJ %u :     TreeNode (%u) ptableId %u ref 0x%x "
4563                       "correlation %u parentPtrI 0x%x",
4564                       instance(),
4565                       treeNodePtr.p->m_node_no,
4566                       treeNodePtr.p->m_primaryTableId,
4567                       treeNodePtr.p->m_send.m_ref,
4568                       treeNodePtr.p->m_send.m_correlation,
4569                       treeNodePtr.p->m_parentPtrI);
4570 
4571 }
4572 
4573 void
4574 Dbspj::dumpRequest(const char* reason,
4575                    const Ptr<Request> requestPtr)
4576 {
4577   jam();
4578 
4579   /* TODO Add to DUMP_STATE_ORD */
4580 
4581   g_eventLogger->info("DBSPJ %u : Dumping request (0x%x:%p) due to %s.",
4582                       instance(),
4583                       requestPtr.i,
4584                       requestPtr.p,
4585                       reason);
4586 
4587   g_eventLogger->info("DBSPJ %u :   Request state %u bits 0x%x errCode %u "
4588                       "senderRef 0x%x rootFragId %u",
4589                       instance(),
4590                       requestPtr.p->m_state,
4591                       requestPtr.p->m_bits,
4592                       requestPtr.p->m_errCode,
4593                       requestPtr.p->m_senderRef,
4594                       requestPtr.p->m_rootFragId);
4595 
4596   g_eventLogger->info("DBSPJ %u :   Request transid (0x%x 0x%x) node_cnt %u "
4597                       "active_cnt %u m_outstanding %u",
4598                       instance(),
4599                       requestPtr.p->m_transId[0],
4600                       requestPtr.p->m_transId[1],
4601                       requestPtr.p->m_node_cnt,
4602                       requestPtr.p->m_cnt_active,
4603                       requestPtr.p->m_outstanding);
4604 
4605   /* Iterate over request's nodes */
4606   {
4607     Ptr<TreeNode> treeNodePtr;
4608     Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
4609     for (list.first(treeNodePtr);
4610          !treeNodePtr.isNull();
4611          list.next(treeNodePtr))
4612     {
4613       jam();
4614       ndbrequire(treeNodePtr.p->m_info != NULL);
4615 
4616       dumpNodeCommon(treeNodePtr);
4617 
4618       if (treeNodePtr.p->m_info->m_dumpNode != NULL)
4619       {
4620         jam();
4621         (this->*(treeNodePtr.p->m_info->m_dumpNode))
4622           (requestPtr, treeNodePtr);
4623       }
4624     }
4625   }
4626 
4627   g_eventLogger->info("DBSPJ %u : Finished dumping request (%u:%p)",
4628                       instance(),
4629                       requestPtr.i,
4630                       requestPtr.p);
4631 }
4632 
4633 void Dbspj::getBufferedRow(const Ptr<TreeNode> treeNodePtr, Uint32 rowId,
4634                            RowPtr *row)
4635 {
4636   DEBUG("getBufferedRow, node no: " << treeNodePtr.p->m_node_no
4637      << ", rowId: " << rowId);
4638   ndbassert(treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY);
4639 
4640   // Set up RowPtr & RowRef for this parent row
4641   RowRef ref;
4642   ndbassert(treeNodePtr.p->m_rows.m_type == RowCollection::COLLECTION_MAP);
4643   treeNodePtr.p->m_rows.m_map.copyto(ref);
4644   const Uint32* const mapptr = get_row_ptr(ref);
4645 
4646   // Relocate parent row from correlation value.
4647   treeNodePtr.p->m_rows.m_map.load(mapptr, rowId, ref);
4648   const Uint32* const rowptr = get_row_ptr(ref);
4649 
4650   RowPtr _row;
4651   _row.m_src_node_ptrI = treeNodePtr.i;
4652   setupRowPtr(treeNodePtr, _row, ref, rowptr);
4653 
4654   getCorrelationData(_row.m_row_data.m_linear,
4655                      _row.m_row_data.m_linear.m_header->m_len - 1,
4656                      _row.m_src_correlation);
4657   *row = _row;
4658 }
4659 
4660 /**
4661  * resumeBufferedNode() -  Resume the execution from the specified TreeNode
4662  *
4663  * All preceding nodes which we depend on have completed their
4664  * batches. The returned result rows from our parent node have
4665  * been buffered, and the match-bitmaps in our scanAncestor(s)
4666  * are set up.
4667  *
4668  * Iterate through all our buffered parent result rows, check their
4669  * 'match' vs the dependencies, and submit request for the
4670  * qualifying rows.
4671  */
4672 void
4673 Dbspj::resumeBufferedNode(Signal* signal,
4674                           Ptr<Request> requestPtr,
4675                           Ptr<TreeNode> treeNodePtr)
4676 {
4677   Ptr<TreeNode> parentPtr;
4678   m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
4679   ndbassert(treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_NODE);
4680   ndbassert(parentPtr.p->m_bits & TreeNode::T_BUFFER_ROW);
4681 
4682   int total = 0, skipped = 0;
4683   RowIterator iter;
4684   for (first(parentPtr.p->m_rows, iter); !iter.isNull(); next(iter))
4685   {
4686     RowPtr parentRow;
4687     jam();
4688     total++;
4689 
4690     parentRow.m_src_node_ptrI = treeNodePtr.p->m_parentPtrI;
4691     setupRowPtr(parentPtr, parentRow,
4692                 iter.m_base.m_ref, iter.m_base.m_row_ptr);
4693 
4694     getCorrelationData(parentRow.m_row_data.m_linear,
4695                        parentRow.m_row_data.m_linear.m_header->m_len - 1,
4696                        parentRow.m_src_correlation);
4697 
4698     // Need to consult the Scan-ancestor(s) to determine if
4699     // INNER_JOIN matches were found for all of our predecessors
4700     Ptr<TreeNode> scanAncestorPtr(parentPtr);
4701     RowPtr scanAncestorRow(parentRow);
4702     if (treeNodePtr.p->m_parentPtrI != treeNodePtr.p->m_scanAncestorPtrI)
4703     {
4704       jam();
4705       m_treenode_pool.getPtr(scanAncestorPtr, treeNodePtr.p->m_scanAncestorPtrI);
4706       getBufferedRow(scanAncestorPtr, (parentRow.m_src_correlation >> 16),
4707                      &scanAncestorRow);
4708     }
4709 
4710     while (true)
4711     {
4712       TreeNodeBitMask required_matches(treeNodePtr.p->m_dependencies);
4713       required_matches.bitAND(scanAncestorPtr.p->m_coverage);
4714 
4715       if (!scanAncestorRow.m_matched->contains(required_matches))
4716       {
4717         DEBUG("parentRow-join SKIPPED");
4718         skipped++;
4719         break;
4720       }
4721 
4722       if (scanAncestorPtr.p->m_coverage.contains(treeNodePtr.p->m_dependencies))
4723       {
4724         jam();
4725         goto row_accepted;
4726       }
4727 
4728       // Have to consult grand-ancestors to verify their matches.
4729       m_treenode_pool.getPtr(scanAncestorPtr, scanAncestorPtr.p->m_scanAncestorPtrI);
4730 
4731       if ((scanAncestorPtr.p->m_bits & TreeNode::T_BUFFER_MATCH) == 0)
4732       {
4733         jam();
4734         goto row_accepted;
4735       }
4736 
4737       getBufferedRow(scanAncestorPtr, (scanAncestorRow.m_src_correlation >> 16),
4738                      &scanAncestorRow);
4739     }
4740     continue;  //Row skipped, didn't 'match' dependent INNER-join -> next row
4741 
4742 row_accepted:
4743     ndbassert(treeNodePtr.p->m_info != NULL);
4744     ndbassert(treeNodePtr.p->m_info->m_parent_row != NULL);
4745     (this->*(treeNodePtr.p->m_info->m_parent_row))(signal, requestPtr, treeNodePtr, parentRow);
4746   }
4747 
4748   DEBUG("resumeBufferedNode: #buffered rows: " << total << ", skipped: " << skipped);
4749 }
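/**
 * Editor's note - a sketch of the match filter applied in the loop
 * above, for an assumed plan where this node's m_dependencies is
 * {1,2} and the scan ancestor's m_coverage is {0,1}: only the covered
 * dependency {1} can be verified against that ancestor row's
 * 'm_matched' mask; node 2 is checked via a grand-ancestor, or
 * accepted outright if no T_BUFFER_MATCH info exists there:
 *
 *   TreeNodeBitMask required(treeNodePtr.p->m_dependencies);  // {1,2}
 *   required.bitAND(scanAncestorPtr.p->m_coverage);           // {1}
 *   if (!scanAncestorRow.m_matched->contains(required))
 *     continue;  // skip: an INNER-joined dependency had no match
 */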
4750 
4751 /**
4752  * END - MODULE GENERIC
4753  */
4754 
4755 void
4756 Dbspj::common_execTRANSID_AI(Signal* signal,
4757                              Ptr<Request> requestPtr,
4758                              Ptr<TreeNode> treeNodePtr,
4759                              const RowPtr & rowRef)
4760 {
4761   if (likely((requestPtr.p->m_state & Request::RS_ABORTING) == 0))
4762   {
4763     // Set 'matched' bit in previous scan ancestors
4764     if ((requestPtr.p->m_bits & Request::RT_MULTI_SCAN) != 0)
4765     {
4766       RowPtr scanAncestorRow(rowRef);
4767       Uint32 scanAncestorPtrI = treeNodePtr.p->m_scanAncestorPtrI;
4768       while (scanAncestorPtrI != RNIL)  // or 'break' below
4769       {
4770         jam();
4771         Ptr<TreeNode> scanAncestorPtr;
4772         m_treenode_pool.getPtr(scanAncestorPtr, scanAncestorPtrI);
4773         if ((scanAncestorPtr.p->m_bits & TreeNode::T_BUFFER_MATCH) == 0)
4774         {
4775           jam();
4776           break;
4777         }
4778 
4779         getBufferedRow(scanAncestorPtr, (scanAncestorRow.m_src_correlation >> 16),
4780                        &scanAncestorRow);
4781 
4782         if (scanAncestorRow.m_matched->get(treeNodePtr.p->m_node_no))
4783         {
4784           jam();
4785           break;
4786         }
4787         scanAncestorRow.m_matched->set(treeNodePtr.p->m_node_no);
4788         scanAncestorPtrI = scanAncestorPtr.p->m_scanAncestorPtrI;
4789       } //while
4790     } //RT_MULTI_SCAN
4791 
4792     LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
4793     Local_dependency_map nextExec(pool, treeNodePtr.p->m_next_nodes);
4794     Dependency_map::ConstDataBufferIterator it;
4795 
4796     /**
4797      * Activate 'next' operations in two steps:
4798      * 1) Any child operations being 'ENQUEUED' are prepared
4799      *    for later resumed execution by appending rowRefs to the
4800      *    deferred list.
4801      * 2) Start immediately executing the non-ENQUEUED child operations.
4802      */
4803     for (nextExec.first(it); !it.isNull(); nextExec.next(it))
4804     {
4805       Ptr<TreeNode> nextTreeNodePtr;
4806       m_treenode_pool.getPtr(nextTreeNodePtr, * it.data);
4807 
4808       if (nextTreeNodePtr.p->m_resumeEvents & TreeNode::TN_ENQUEUE_OP)
4809       {
4810         jam();
4811         DEBUG("ENQUEUE row for deferred TreeNode: " << nextTreeNodePtr.p->m_node_no);
4812 
4813         /**
4814          * 'rowRef' is the ancestor row from the immediate ancestor in
4815          * the execution plan. In case this is different from the parent-treeNode
4816          * in the 'query', we have to find the 'real' parentRow from the
4817          * parent as defined in the 'query'
4818          */
4819         RowPtr parentRow(rowRef);
4820         if (nextTreeNodePtr.p->m_parentPtrI != treeNodePtr.i)
4821         {
4822           Ptr<TreeNode> parentPtr;
4823           const Uint32 parentRowId = (parentRow.m_src_correlation >> 16);
4824           m_treenode_pool.getPtr(parentPtr, nextTreeNodePtr.p->m_parentPtrI);
4825           getBufferedRow(parentPtr, parentRowId, &parentRow);
4826         }
4827 
4828         /**
4829          * Append correlation values of deferred operations
4830          * to a list / fifo. Upon resume, we will then be able to
4831          * relocate all BUFFER'ed parent rows for which to resume operations.
4832          */
4833         bool appended;
4834         {
4835           // Need a separate scope for correlation_list, as ::lookup_abort() will
4836           // also construct such a list. Such nested usage is not allowed.
4837           LocalArenaPool<DataBufferSegment<14> > pool(nextTreeNodePtr.p->m_batchArena, m_dependency_map_pool);
4838           Local_correlation_list correlations(pool, nextTreeNodePtr.p->m_deferred.m_correlations);
4839           appended = correlations.append(&parentRow.m_src_correlation, 1);
4840         }
4841         if (unlikely(!appended))
4842         {
4843           jam();
4844           abort(signal, requestPtr, DbspjErr::OutOfQueryMemory);
4845           return;
4846         }
4847 
4848         // As there are pending deferred operations we are not complete
4849         requestPtr.p->m_completed_tree_nodes.clear(nextTreeNodePtr.p->m_node_no);
4850       } //TN_ENQUEUE_OP
4851     }
4852 
4853     for (nextExec.first(it); !it.isNull(); nextExec.next(it))
4854     {
4855       Ptr<TreeNode> nextTreeNodePtr;
4856       m_treenode_pool.getPtr(nextTreeNodePtr, * it.data);
4857 
4858       /**
4859        * Execution of 'next' TreeNode may have to be delayed. Will be resumed
4860        * later, either by lookup_resume() or resumeBufferedNode()
4861        */
4862       static const Uint32 delayExec = TreeNode::TN_ENQUEUE_OP
4863                                     | TreeNode::TN_EXEC_WAIT;
4864 
4865       if ((nextTreeNodePtr.p->m_resumeEvents & delayExec) == 0)
4866       {
4867         jam();
4868 
4869         /**
4870          * 'rowRef' is the ancestor row from the immediate ancestor in
4871          * the execution plan. In case this is different from the parent-treeNode
4872          * in the 'query', we have to find the 'real' parentRow from the
4873          * parent as defined in the 'query'
4874          */
4875         RowPtr parentRow(rowRef);
4876         if (nextTreeNodePtr.p->m_parentPtrI != treeNodePtr.i)
4877         {
4878           Ptr<TreeNode> parentPtr;
4879           const Uint32 parentRowId = (parentRow.m_src_correlation >> 16);
4880           m_treenode_pool.getPtr(parentPtr, nextTreeNodePtr.p->m_parentPtrI);
4881           getBufferedRow(parentPtr, parentRowId, &parentRow);
4882         }
4883 
4884         ndbassert(nextTreeNodePtr.p->m_info != NULL);
4885         ndbassert(nextTreeNodePtr.p->m_info->m_parent_row != NULL);
4886 
4887         (this->*(nextTreeNodePtr.p->m_info->m_parent_row))(signal,
4888                                                     requestPtr, nextTreeNodePtr, parentRow);
4889 
4890         /* Recheck RS_ABORTING as 'next' operation might have aborted */
4891         if (unlikely(requestPtr.p->m_state & Request::RS_ABORTING))
4892         {
4893           jam();
4894           return;
4895         }
4896       }
4897     }
4898   }
4899 }
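/**
 * Flow sketch for the two-step activation in common_execTRANSID_AI() above,
 * assuming a hypothetical plan 'op1 -> {op2, op3}' where op3 has
 * TN_ENQUEUE_OP set:
 *
 *   1) Pass one appends op3's parent correlation value to its
 *      'm_deferred.m_correlations' list and clears op3 from
 *      m_completed_tree_nodes; it is executed later via lookup_resume().
 *   2) Pass two immediately invokes op2's m_info->m_parent_row method with
 *      the (possibly re-located) parentRow; TN_ENQUEUE_OP / TN_EXEC_WAIT
 *      nodes are skipped in this pass.
 */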
4900 
4901 
4902 /**
4903  * MODULE LOOKUP
4904  */
4905 const Dbspj::OpInfo
4906 Dbspj::g_LookupOpInfo =
4907 {
4908   &Dbspj::lookup_build,
4909   0, // prepare
4910   &Dbspj::lookup_start,
4911   &Dbspj::lookup_countSignal,
4912   &Dbspj::lookup_execLQHKEYREF,
4913   &Dbspj::lookup_execLQHKEYCONF,
4914   0, // execSCAN_FRAGREF
4915   0, // execSCAN_FRAGCONF
4916   &Dbspj::lookup_parent_row,
4917   0, // Dbspj::lookup_parent_batch_complete,
4918   0, // Dbspj::lookup_parent_batch_repeat,
4919   0, // Dbspj::lookup_parent_batch_cleanup,
4920   0, // Dbspj::lookup_execSCAN_NEXTREQ
4921   0, // Dbspj::lookup_complete
4922   &Dbspj::lookup_abort,
4923   &Dbspj::lookup_execNODE_FAILREP,
4924   &Dbspj::lookup_cleanup,
4925   &Dbspj::lookup_checkNode,
4926   &Dbspj::lookup_dumpNode
4927 };
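/**
 * g_LookupOpInfo is the lookup instance of the OpInfo dispatch table:
 * lookup_build() binds a TreeNode to it ('m_info = &g_LookupOpInfo'), and
 * generic code then calls through the member-function pointers, e.g. as in
 * common_execTRANSID_AI() above:
 *
 *   (this->*(treeNodePtr.p->m_info->m_parent_row))
 *     (signal, requestPtr, treeNodePtr, parentRow);
 *
 * Entries left as 0 are never invoked for this operation type.
 */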
4928 
4929 Uint32
4930 Dbspj::lookup_build(Build_context& ctx,
4931                     Ptr<Request> requestPtr,
4932                     const QueryNode* qn,
4933                     const QueryNodeParameters* qp)
4934 {
4935   Uint32 err = 0;
4936   Ptr<TreeNode> treeNodePtr;
4937   const QN_LookupNode * node = (const QN_LookupNode*)qn;
4938   const QN_LookupParameters * param = (const QN_LookupParameters*)qp;
4939   do
4940   {
4941     jam();
4942     err = DbspjErr::InvalidTreeNodeSpecification;
4943     if (unlikely(node->len < QN_LookupNode::NodeSize))
4944     {
4945       jam();
4946       break;
4947     }
4948 
4949     err = DbspjErr::InvalidTreeParametersSpecification;
4950     DEBUG("param len: " << param->len);
4951     if (unlikely(param->len < QN_LookupParameters::NodeSize))
4952     {
4953       jam();
4954       break;
4955     }
4956 
4957     err = createNode(ctx, requestPtr, treeNodePtr);
4958     if (unlikely(err != 0))
4959     {
4960       jam();
4961       break;
4962     }
4963 
4964     treeNodePtr.p->m_tableOrIndexId = node->tableId;
4965     treeNodePtr.p->m_primaryTableId = node->tableId;
4966     treeNodePtr.p->m_schemaVersion = node->tableVersion;
4967     treeNodePtr.p->m_info = &g_LookupOpInfo;
4968     Uint32 transId1 = requestPtr.p->m_transId[0];
4969     Uint32 transId2 = requestPtr.p->m_transId[1];
4970     Uint32 savePointId = ctx.m_savepointId;
4971 
4972     Uint32 treeBits = node->requestInfo;
4973     Uint32 paramBits = param->requestInfo;
4974     //ndbout_c("Dbspj::lookup_build() treeBits=%.8x paramBits=%.8x",
4975     //         treeBits, paramBits);
4976     LqhKeyReq* dst = (LqhKeyReq*)treeNodePtr.p->m_lookup_data.m_lqhKeyReq;
4977     {
4978       /**
4979        * static variables
4980        */
4981       dst->tcBlockref = reference();
4982       dst->clientConnectPtr = treeNodePtr.i;
4983 
4984       /**
4985        * TODO reference()+treeNodePtr.i is passed twice
4986        *   this can likely be optimized using the requestInfo-bits
4987        * UPDATE: This can be accomplished by *not* setApplicationAddressFlag
4988        *         and patch LQH to then instead use tcBlockref/clientConnectPtr
4989        */
4990       dst->transId1 = transId1;
4991       dst->transId2 = transId2;
4992       dst->savePointId = savePointId;
4993       dst->scanInfo = 0;
4994       dst->attrLen = 0;
4995       /** Initially set reply ref to client; do_send will set SPJ refs if non-LEAF */
4996       dst->variableData[0] = ctx.m_resultRef;
4997       dst->variableData[1] = param->resultData;
4998       Uint32 requestInfo = 0;
4999       LqhKeyReq::setOperation(requestInfo, ZREAD);
5000       LqhKeyReq::setApplicationAddressFlag(requestInfo, 1);
5001       LqhKeyReq::setDirtyFlag(requestInfo, 1);
5002       LqhKeyReq::setSimpleFlag(requestInfo, 1);
5003       LqhKeyReq::setNormalProtocolFlag(requestInfo, 0);  // Assume T_LEAF
5004       LqhKeyReq::setCorrFactorFlag(requestInfo, 1);
5005       LqhKeyReq::setNoDiskFlag(requestInfo,
5006                                (treeBits & DABits::NI_LINKED_DISK) == 0 &&
5007                                (paramBits & DABits::PI_DISK_ATTR) == 0);
5008 
5009       // FirstMatch in a lookup request can just be ignored
5010       //if (treeBits & DABits::NI_FIRST_MATCH)
5011       //{}
5012 
5013       dst->requestInfo = requestInfo;
5014     }
5015 
5016     if (treeBits & QN_LookupNode::L_UNIQUE_INDEX)
5017     {
5018       jam();
5019       treeNodePtr.p->m_bits |= TreeNode::T_UNIQUE_INDEX_LOOKUP;
5020     }
5021 
5022     Uint32 tableId = node->tableId;
5023     Uint32 schemaVersion = node->tableVersion;
5024 
5025     Uint32 tableSchemaVersion = tableId + ((schemaVersion << 16) & 0xFFFF0000);
5026     dst->tableSchemaVersion = tableSchemaVersion;
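    /**
     * Packing sketch for the line above (illustrative; assumes tableId
     * fits in the low 16 bits):
     *
     *   tableId       == (tableSchemaVersion & 0xFFFF)
     *   schemaVersion == (tableSchemaVersion >> 16)   // low 16 bits only
     *
     * This matches the '& 0xFFFF' unpacking used when deriving the LQH
     * instance key from 'src->tableSchemaVersion' further below.
     */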
5027 
5028     ctx.m_resultData = param->resultData;
5029     treeNodePtr.p->m_lookup_data.m_api_resultRef = ctx.m_resultRef;
5030     treeNodePtr.p->m_lookup_data.m_api_resultData = param->resultData;
5031     treeNodePtr.p->m_lookup_data.m_outstanding = 0;
5032 
5033     /**
5034      * Parse stuff common to lookup/scan-frag
5035      */
5036     struct DABuffer nodeDA, paramDA;
5037     nodeDA.ptr = node->optional;
5038     nodeDA.end = nodeDA.ptr + (node->len - QN_LookupNode::NodeSize);
5039     paramDA.ptr = param->optional;
5040     paramDA.end = paramDA.ptr + (param->len - QN_LookupParameters::NodeSize);
5041     err = parseDA(ctx, requestPtr, treeNodePtr,
5042                   nodeDA, treeBits, paramDA, paramBits);
5043     if (unlikely(err != 0))
5044     {
5045       jam();
5046       break;
5047     }
5048 
5049     if (treeNodePtr.p->m_bits & TreeNode::T_ATTR_INTERPRETED)
5050     {
5051       jam();
5052       LqhKeyReq::setInterpretedFlag(dst->requestInfo, 1);
5053     }
5054 
5055     /**
5056      * Inherit batch size from parent
5057      */
5058     treeNodePtr.p->m_batch_size = 1;
5059     if (treeNodePtr.p->m_parentPtrI != RNIL)
5060     {
5061       jam();
5062       Ptr<TreeNode> parentPtr;
5063       m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
5064       treeNodePtr.p->m_batch_size = parentPtr.p->m_batch_size;
5065     }
5066 
5067     if (ctx.m_start_signal)
5068     {
5069       jam();
5070       Signal * signal = ctx.m_start_signal;
5071       const LqhKeyReq* src = (const LqhKeyReq*)signal->getDataPtr();
5072 #ifdef NOT_YET
5073       Uint32 instanceNo =
5074         blockToInstance(signal->header.theReceiversBlockNumber);
5075       treeNodePtr.p->m_send.m_ref = numberToRef(DBLQH,
5076                                                 instanceNo, getOwnNodeId());
5077 #else
5078       treeNodePtr.p->m_send.m_ref =
5079         numberToRef(DBLQH, getInstanceKey(src->tableSchemaVersion & 0xFFFF,
5080                                           src->fragmentData & 0xFFFF),
5081                     getOwnNodeId());
5082 #endif
5083 
5084       Uint32 hashValue = src->hashValue;
5085       Uint32 fragId = src->fragmentData;
5086       Uint32 attrLen = src->attrLen; // fragdist-key is in here
5087 
5088       /**
5089        * assertions
5090        */
5091 #ifdef VM_TRACE
5092       Uint32 requestInfo = src->requestInfo;
5093       ndbassert(LqhKeyReq::getAttrLen(attrLen) == 0);         // Only long
5094       ndbassert(LqhKeyReq::getScanTakeOverFlag(attrLen) == 0);// Not supported
5095       ndbassert(LqhKeyReq::getReorgFlag(attrLen) == ScanFragReq::REORG_ALL);       // Not supported
5096       ndbassert(LqhKeyReq::getOperation(requestInfo) == ZREAD);
5097       ndbassert(LqhKeyReq::getKeyLen(requestInfo) == 0);      // Only long
5098       ndbassert(LqhKeyReq::getMarkerFlag(requestInfo) == 0);  // Only read
5099       ndbassert(LqhKeyReq::getAIInLqhKeyReq(requestInfo) == 0);
5100       ndbassert(LqhKeyReq::getSeqNoReplica(requestInfo) == 0);
5101       ndbassert(LqhKeyReq::getLastReplicaNo(requestInfo) == 0);
5102       ndbassert(LqhKeyReq::getApplicationAddressFlag(requestInfo) != 0);
5103       ndbassert(LqhKeyReq::getSameClientAndTcFlag(requestInfo) == 0);
5104 #endif
5105 
5106 #ifdef TODO
5107       /**
5108        * Handle various lock-modes
5109        */
5110       static Uint8 getDirtyFlag(const UintR & requestInfo);
5111       static Uint8 getSimpleFlag(const UintR & requestInfo);
5112 #endif
5113 
5114 #ifdef VM_TRACE
5115       Uint32 dst_requestInfo = dst->requestInfo;
5116       ndbassert(LqhKeyReq::getInterpretedFlag(requestInfo) ==
5117                 LqhKeyReq::getInterpretedFlag(dst_requestInfo));
5118       ndbassert(LqhKeyReq::getNoDiskFlag(requestInfo) ==
5119                 LqhKeyReq::getNoDiskFlag(dst_requestInfo));
5120 #endif
5121 
5122       dst->hashValue = hashValue;
5123       dst->fragmentData = fragId;
5124       dst->attrLen = attrLen; // fragdist is in here
5125 
5126       treeNodePtr.p->m_bits |= TreeNode::T_ONE_SHOT;
5127     }
5128     return 0;
5129   } while (0);
5130 
5131   return err;
5132 }
5133 
5134 void
5135 Dbspj::lookup_start(Signal* signal,
5136                     Ptr<Request> requestPtr,
5137                     Ptr<TreeNode> treeNodePtr)
5138 {
5139   lookup_send(signal, requestPtr, treeNodePtr);
5140 }
5141 
5142 void
5143 Dbspj::lookup_send(Signal* signal,
5144                    Ptr<Request> requestPtr,
5145                    Ptr<TreeNode> treeNodePtr)
5146 {
5147   jam();
5148   if (!ERROR_INSERTED(17521)) // Avoid emulated rnd errors
5149   {
5150     // ::checkTableError() should be handled before we reach this far
5151     ndbassert(checkTableError(treeNodePtr) == 0);
5152   }
5153 
5154   /**
5155    * Count number of expected reply signals:
5156    *  CONF or REF reply:
5157    *  - Expected by every non-leaf TreeNodes
5158    *  - For a scan request even leaf TreeNodes get a CONF/REF reply.
5159    *
5160    *  TRANSID_AI reply:
5161    *  - Expected for all TreeNodes having T_EXPECT_TRANSID_AI
5162    */
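  /**
   * Worked example of the rules above: a non-leaf lookup TreeNode in a
   * scan request with T_EXPECT_TRANSID_AI set waits for cnt == 2
   * (CONF/REF + TRANSID_AI), while a leaf TreeNode in a lookup request,
   * whose result row goes directly to the API, waits for cnt == 0.
   */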
5163   Uint32 cnt = 0;
5164 
5165   if (requestPtr.p->isScan() || !treeNodePtr.p->isLeaf())    //CONF/REF
5166     cnt++;
5167 
5168   if (treeNodePtr.p->m_bits & TreeNode::T_EXPECT_TRANSID_AI) //TRANSID_AI
5169     cnt++;
5170 
5171   LqhKeyReq* req = reinterpret_cast<LqhKeyReq*>(signal->getDataPtrSend());
5172 
5173   memcpy(req, treeNodePtr.p->m_lookup_data.m_lqhKeyReq,
5174          sizeof(treeNodePtr.p->m_lookup_data.m_lqhKeyReq));
5175   req->variableData[2] = treeNodePtr.p->m_send.m_correlation;
5176   req->variableData[3] = requestPtr.p->m_rootResultData;
5177 
5178   if (!treeNodePtr.p->isLeaf() || requestPtr.p->isScan())
5179   {
5180     // Non-LEAF wants the reply to go to SPJ instead of the ApiClient.
5181     LqhKeyReq::setNormalProtocolFlag(req->requestInfo, 1);
5182     req->variableData[0] = reference();
5183     req->variableData[1] = treeNodePtr.i;
5184   }
5185   else
5186   {
5187     jam();
5188     /**
5189      * Fake that TC sent this request,
5190      *   so that it can route a possible TCKEYREF
5191      */
5192     req->tcBlockref = requestPtr.p->m_senderRef;
5193   }
5194 
5195   SectionHandle handle(this);
5196 
5197   Uint32 ref = treeNodePtr.p->m_send.m_ref;
5198   Uint32 keyInfoPtrI = treeNodePtr.p->m_send.m_keyInfoPtrI;
5199   Uint32 attrInfoPtrI = treeNodePtr.p->m_send.m_attrInfoPtrI;
5200 
5201   Uint32 err = 0;
5202 
5203   do
5204   {
5205     if (treeNodePtr.p->m_bits & TreeNode::T_ONE_SHOT)
5206     {
5207       jam();
5208       /**
5209        * Pass sections to send
5210        */
5211       treeNodePtr.p->m_send.m_attrInfoPtrI = RNIL;
5212       treeNodePtr.p->m_send.m_keyInfoPtrI = RNIL;
5213     }
5214     else
5215     {
5216       if ((treeNodePtr.p->m_bits & TreeNode::T_KEYINFO_CONSTRUCTED) == 0)
5217       {
5218         jam();
5219         Uint32 tmp = RNIL;
5220         if (!dupSection(tmp, keyInfoPtrI))
5221         {
5222           jam();
5223           ndbassert(tmp == RNIL);  // Guard for memleak
5224           err = DbspjErr::OutOfSectionMemory;
5225           break;
5226         }
5227 
5228         keyInfoPtrI = tmp;
5229       }
5230       else
5231       {
5232         jam();
5233         treeNodePtr.p->m_send.m_keyInfoPtrI = RNIL;
5234       }
5235 
5236       if ((treeNodePtr.p->m_bits & TreeNode::T_ATTRINFO_CONSTRUCTED) == 0)
5237       {
5238         jam();
5239         Uint32 tmp = RNIL;
5240 
5241         /**
5242          * Test execution terminated due to 'OutOfSectionMemory' which
5243          * may happen for different treeNodes in the request:
5244          * - 17070: Fail on any lookup_send()
5245          * - 17071: Fail on lookup_send() if 'isLeaf'
5246          * - 17072: Fail on lookup_send() if treeNode not root
5247          */
5248         if (ERROR_INSERTED(17070) ||
5249            (ERROR_INSERTED(17071) && treeNodePtr.p->isLeaf()) ||
5250            (ERROR_INSERTED(17072) && treeNodePtr.p->m_parentPtrI != RNIL))
5251         {
5252           jam();
5253           CLEAR_ERROR_INSERT_VALUE;
5254           ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
5255                    __LINE__,  __FILE__);
5256           releaseSection(keyInfoPtrI);
5257           err = DbspjErr::OutOfSectionMemory;
5258           break;
5259         }
5260 
5261         if (!dupSection(tmp, attrInfoPtrI))
5262         {
5263           jam();
5264           ndbassert(tmp == RNIL);  // Guard for memleak
5265           releaseSection(keyInfoPtrI);
5266           err = DbspjErr::OutOfSectionMemory;
5267           break;
5268         }
5269 
5270         attrInfoPtrI = tmp;
5271       }
5272       else
5273       {
5274         jam();
5275         treeNodePtr.p->m_send.m_attrInfoPtrI = RNIL;
5276       }
5277     }
5278 
5279     getSection(handle.m_ptr[0], keyInfoPtrI);
5280     getSection(handle.m_ptr[1], attrInfoPtrI);
5281     handle.m_cnt = 2;
5282 
5283     /**
5284      * Inject error to test LQHKEYREF handling:
5285      * Tampering with tableSchemaVersion such that LQH will
5286      * return LQHKEYREF('1227: Invalid schema version')
5287      * May happen for different treeNodes in the request:
5288      * - 17030: Fail on any lookup_send()
5289      * - 17031: Fail on lookup_send() if 'isLeaf'
5290      * - 17032: Fail on lookup_send() if treeNode not root
5291      */
5292     if (ERROR_INSERTED(17030) ||
5293        (ERROR_INSERTED(17031) && treeNodePtr.p->isLeaf()) ||
5294        (ERROR_INSERTED(17032) && treeNodePtr.p->m_parentPtrI != RNIL))
5295     {
5296       jam();
5297       CLEAR_ERROR_INSERT_VALUE;
5298       req->tableSchemaVersion += (1 << 16); // Provoke 'Invalid schema version'
5299     }
5300 
5301 #if defined DEBUG_LQHKEYREQ
5302     ndbout_c("LQHKEYREQ to %x", ref);
5303     printLQHKEYREQ(stdout, signal->getDataPtrSend(),
5304                    NDB_ARRAY_SIZE(treeNodePtr.p->m_lookup_data.m_lqhKeyReq),
5305                    DBLQH);
5306     printf("KEYINFO: ");
5307     print(handle.m_ptr[0], stdout);
5308     printf("ATTRINFO: ");
5309     print(handle.m_ptr[1], stdout);
5310 #endif
5311 
5312     Uint32 Tnode = refToNode(ref);
5313     if (Tnode == getOwnNodeId())
5314     {
5315       c_Counters.incr_counter(CI_LOCAL_READS_SENT, 1);
5316     }
5317     else
5318     {
5319       ndbrequire(!ERROR_INSERTED(17014));
5320 
5321       c_Counters.incr_counter(CI_REMOTE_READS_SENT, 1);
5322     }
5323 
5324     /**
5325      * Test correct abort handling if the datanode is not (yet)
5326      * connected to the requesting API node.
5327      */
5328     if (ERROR_INSERTED(17530) &&
5329         !getNodeInfo(getResultRef(requestPtr)).m_connected)
5330     {
5331       jam();
5332       releaseSections(handle);
5333       err = DbspjErr::OutOfSectionMemory; //Fake an error likely seen here
5334       break;
5335     }
5336 
5337     /**
5338      * Test execution terminated due to 'NodeFailure' which
5339      * may happen for different treeNodes in the request:
5340      * - 17020: Fail on any lookup_send()
5341      * - 17021: Fail on lookup_send() if 'isLeaf'
5342      * - 17022: Fail on lookup_send() if treeNode not root
5343      */
5344     if (ERROR_INSERTED(17020) ||
5345        (ERROR_INSERTED(17021) && treeNodePtr.p->isLeaf()) ||
5346        (ERROR_INSERTED(17022) && treeNodePtr.p->m_parentPtrI != RNIL))
5347     {
5348       jam();
5349       CLEAR_ERROR_INSERT_VALUE;
5350       releaseSections(handle);
5351       err = DbspjErr::NodeFailure;
5352       break;
5353     }
5354 
5355     if (unlikely(!c_alive_nodes.get(Tnode)))
5356     {
5357       jam();
5358       releaseSections(handle);
5359       err = DbspjErr::NodeFailure;
5360       break;
5361     }
5362     else if (cnt > 0)
5363     {
5364       // Register the 'cnt' reply signals required before completion
5365       jam();
5366       ndbassert(Tnode < NDB_ARRAY_SIZE(requestPtr.p->m_lookup_node_data));
5367       requestPtr.p->m_completed_tree_nodes.clear(treeNodePtr.p->m_node_no);
5368       requestPtr.p->m_outstanding += cnt;
5369       requestPtr.p->m_lookup_node_data[Tnode] += cnt;
5370       // Guard against the counter wrapping around to zero
5371       ndbrequire(requestPtr.p->m_lookup_node_data[Tnode] != 0);
5372     }
5373 
5374     sendSignal(ref, GSN_LQHKEYREQ, signal,
5375                NDB_ARRAY_SIZE(treeNodePtr.p->m_lookup_data.m_lqhKeyReq),
5376                JBB, &handle);
5377 
5378     treeNodePtr.p->m_lookup_data.m_outstanding += cnt;
5379     if (requestPtr.p->isLookup() && treeNodePtr.p->isLeaf())
5380     {
5381       jam();
5382       /**
5383        * Send TCKEYCONF with DirtyReadBit + Tnode,
5384        *   so that API can discover if Tnode died while waiting for result
5385        */
5386       lookup_sendLeafCONF(signal, requestPtr, treeNodePtr, Tnode);
5387     }
5388     return;
5389   }
5390   while (0);
5391 
5392   ndbrequire(err);
5393   jam();
5394   abort(signal, requestPtr, err);
5395 } //Dbspj::lookup_send
5396 
5397 void
5398 Dbspj::lookup_countSignal(const Signal* signal,
5399                        Ptr<Request> requestPtr,
5400                        Ptr<TreeNode> treeNodePtr,
5401                        Uint32 cnt)
5402 {
5403   jam();
5404   const Uint32 Tnode = refToNode(signal->getSendersBlockRef());
5405 
5406   ndbassert(requestPtr.p->m_lookup_node_data[Tnode] >= cnt);
5407   requestPtr.p->m_lookup_node_data[Tnode] -= cnt;
5408 
5409   ndbassert(requestPtr.p->m_outstanding >= cnt);
5410   requestPtr.p->m_outstanding -= cnt;
5411 
5412   ndbassert(treeNodePtr.p->m_lookup_data.m_outstanding >= cnt);
5413   treeNodePtr.p->m_lookup_data.m_outstanding -= cnt;
5414 
5415   if (treeNodePtr.p->m_lookup_data.m_outstanding == 0 &&
5416       treeNodePtr.p->m_deferred.isEmpty())
5417   {
5418     jam();
5419     // We have received all rows for this treeNode in this batch.
5420     requestPtr.p->m_completed_tree_nodes.set(treeNodePtr.p->m_node_no);
5421   }
5422 }
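/**
 * Note on the accounting in lookup_countSignal() above: the same 'cnt' is
 * subtracted from three parallel counters - per data node
 * (m_lookup_node_data, consumed by lookup_execNODE_FAILREP), per Request
 * (m_outstanding) and per TreeNode (m_lookup_data.m_outstanding).
 * A call with cnt == 0 is also meaningful: it re-evaluates the completion
 * status, as done for the never-sent NULL-key LQHKEYREQs in lookup_row().
 */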
5423 
5424 void
5425 Dbspj::lookup_execLQHKEYREF(Signal* signal,
5426                             Ptr<Request> requestPtr,
5427                             Ptr<TreeNode> treeNodePtr)
5428 {
5429   jam();
5430   const LqhKeyRef * rep = (LqhKeyRef*)signal->getDataPtr();
5431   const Uint32 errCode = rep->errorCode;
5432 
5433   c_Counters.incr_counter(CI_READS_NOT_FOUND, 1);
5434 
5435   DEBUG("lookup_execLQHKEYREF, errorCode:" << errCode);
5436 
5437   if (treeNodePtr.p->m_bits & TreeNode::T_EXPECT_TRANSID_AI)
5438   {
5439     // Count(==2) the REF and the non-arriving TRANSID_AI
5440     lookup_countSignal(signal, requestPtr, treeNodePtr, 2);
5441   }
5442   else
5443   {
5444     // Count(==1) only awaiting CONF/REF
5445     lookup_countSignal(signal, requestPtr, treeNodePtr, 1);
5446   }
5447 
5448   /**
5449    * If the Request is still actively running, the API needs to
5450    * be informed about the error.
5451    * The error code may either indicate a 'hard error', which should
5452    * terminate the query execution, or a 'soft error', which should
5453    * be signaled to the NDB API while execution continues.
5454    */
5455   if (likely((requestPtr.p->m_state & Request::RS_ABORTING) == 0))
5456   {
5457     switch(errCode){
5458     case 626: // 'Soft error' : Row not found
5459     case 899: // 'Soft error' : Interpreter_exit_nok
5460 
5461       jam();
5462       /**
5463        * Only lookup requests need to send TCKEYREF...
5464        */
5465       if (requestPtr.p->isLookup())
5466       {
5467         jam();
5468         lookup_stop_branch(signal, requestPtr, treeNodePtr, errCode);
5469       }
5470       break;
5471 
5472     default: // 'Hard error' : abort query
5473       jam();
5474       abort(signal, requestPtr, errCode);
5475       return;
5476     }
5477   }
5478 
5479   /**
5480    * Another TreeNode may have been waiting for the completion of this
5481    * request before it could resume its own operation.
5482    */
5483   if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_REF)
5484   {
5485     jam();
5486     ndbassert(treeNodePtr.p->m_resumePtrI != RNIL);
5487     Ptr<TreeNode> resumeTreeNodePtr;
5488     m_treenode_pool.getPtr(resumeTreeNodePtr, treeNodePtr.p->m_resumePtrI);
5489     lookup_resume(signal, requestPtr, resumeTreeNodePtr);
5490   }
5491 
5492   if (requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no))
5493   {
5494     jam();
5495     // We have received all rows for this treeNode in this batch.
5496     handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
5497   }
5498 }
5499 
5500 /**
5501  * lookup_stop_branch() will send the required signals to the API
5502  * to inform it that the query branch starting with 'treeNodePtr'
5503  * will not be executed due to 'errCode'.
5504  *
5505  * NOTE: 'errCode' is expected to be a 'soft error', like
5506  *       'row not found', and is *not* intended to abort
5507  *       the entire query.
5508  */
5509 void
5510 Dbspj::lookup_stop_branch(Signal* signal,
5511                           Ptr<Request> requestPtr,
5512                           Ptr<TreeNode> treeNodePtr,
5513                           Uint32 errCode)
5514 {
5515   ndbassert(requestPtr.p->isLookup());
5516   DEBUG("::lookup_stop_branch"
5517      << ", node: " << treeNodePtr.p->m_node_no
5518   );
5519 
5520   /**
5521    * If this is a "leaf" node, either on its own, or
5522    * indirectly through a unique index lookup:
5523    * Ordinary operation would have emitted the extra TCKEYCONF
5524    * required for nodefail handling.
5525    * (In case of nodefails during final leaf REQs.)
5526    * As the API can't, or at least does not try to, tell whether a
5527    * leaf operation is REFed by SPJ or LQH, we still have to
5528    * send this extra CONF as required by the protocol.
5529    */
5530   if (treeNodePtr.p->isLeaf())
5531   {
5532     jam();
5533     DEBUG("  Leaf-lookup: sending extra 'CONF' for nodefail handling");
5534     lookup_sendLeafCONF(signal, requestPtr, treeNodePtr, getOwnNodeId());
5535   }
5536 
5537   else if (treeNodePtr.p->m_bits & TreeNode::T_UNIQUE_INDEX_LOOKUP)
5538   {
5539     /**
5540      * UNIQUE_INDEX lookups are represented with an additional
5541      * child which does the lookup from UQ-index into the table
5542      * child which does the lookup from the UQ-index into the table
5543      * itself. We have to check whether this child is a 'leaf'.
5544     LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
5545     Local_dependency_map list(pool, treeNodePtr.p->m_child_nodes);
5546     Dependency_map::ConstDataBufferIterator it;
5547     ndbrequire(list.first(it));
5548     ndbrequire(list.getSize() == 1); // should only be 1 child
5549     Ptr<TreeNode> childPtr;
5550     m_treenode_pool.getPtr(childPtr, * it.data);
5551     if (childPtr.p->isLeaf())
5552     {
5553       jam();
5554       DEBUG("  UNIQUE_INDEX-Leaf-lookup: sending extra 'CONF' "
5555             "for nodefail handling");
5556       lookup_sendLeafCONF(signal, requestPtr, childPtr, getOwnNodeId());
5557     }
5558   }
5559 
5560   /**
5561    * Then produce the REF(errCode) which terminates this
5562    * tree branch.
5563    */
5564   const Uint32 resultRef = treeNodePtr.p->m_lookup_data.m_api_resultRef;
5565   const Uint32 resultData = treeNodePtr.p->m_lookup_data.m_api_resultData;
5566   TcKeyRef* ref = (TcKeyRef*)signal->getDataPtr();
5567   ref->connectPtr = resultData;
5568   ref->transId[0] = requestPtr.p->m_transId[0];
5569   ref->transId[1] = requestPtr.p->m_transId[1];
5570   ref->errorCode = errCode;
5571   ref->errorData = 0;
5572 
5573   DEBUG("  send TCKEYREF");
5574   sendTCKEYREF(signal, resultRef, requestPtr.p->m_senderRef);
5575 }
5576 
5577 /**
5578  * Lookup leafs in lookup requests will not receive CONF/REF
5579  * back to SPJ when the LQH request has completed. Instead we
5580  * will cleanup() the request when the last leafnode KEYREQ
5581  * has been sent. If any of the REQuested datanodes fail
5582  * after this, SPJ will not detect it and will be unable to
5583  * send the appropriate signals to the API to wake it from its
5584  * 'wait' state.
5585  * To get around this, we instead send an extra CONF
5586  * to the API which inform it about which 'node' it should
5587  * expect a result from. API can then discover if this
5588  * 'node' died while waiting for results.
5589  */
5590 void
5591 Dbspj::lookup_sendLeafCONF(Signal* signal,
5592                            Ptr<Request> requestPtr,
5593                            Ptr<TreeNode> treeNodePtr,
5594                            Uint32 node)
5595 {
5596   ndbassert(treeNodePtr.p->isLeaf());
5597 
5598   const Uint32 resultRef = treeNodePtr.p->m_lookup_data.m_api_resultRef;
5599   const Uint32 resultData = treeNodePtr.p->m_lookup_data.m_api_resultData;
5600   TcKeyConf* const conf = (TcKeyConf*)signal->getDataPtr();
5601   conf->apiConnectPtr = RNIL;
5602   conf->confInfo = 0;
5603   conf->gci_hi = 0;
5604   TcKeyConf::setNoOfOperations(conf->confInfo, 1);
5605   conf->transId1 = requestPtr.p->m_transId[0];
5606   conf->transId2 = requestPtr.p->m_transId[1];
5607   conf->operations[0].apiOperationPtr = resultData;
5608   conf->operations[0].attrInfoLen =
5609     TcKeyConf::DirtyReadBit | node;
5610   const Uint32 sigLen = TcKeyConf::StaticLength + TcKeyConf::OperationLength;
5611   sendTCKEYCONF(signal, sigLen, resultRef, requestPtr.p->m_senderRef);
5612 }
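/**
 * Encoding sketch for the CONF above: 'attrInfoLen' is overloaded to carry
 * 'TcKeyConf::DirtyReadBit | node', so the receiver can recover the data
 * node to watch with something like (illustrative only):
 *
 *   const Uint32 expectedNode =
 *     conf->operations[0].attrInfoLen & ~Uint32(TcKeyConf::DirtyReadBit);
 */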
5613 
5614 
5615 void
5616 Dbspj::lookup_execLQHKEYCONF(Signal* signal,
5617                              Ptr<Request> requestPtr,
5618                              Ptr<TreeNode> treeNodePtr)
5619 {
5620   ndbrequire(!(requestPtr.p->isLookup() && treeNodePtr.p->isLeaf()));
5621 
5622   if (treeNodePtr.p->m_bits & TreeNode::T_USER_PROJECTION)
5623   {
5624     jam();
5625     requestPtr.p->m_rows++;
5626   }
5627 
5628   // Count awaiting CONF. If non-leaf, there will also be a TRANSID_AI
5629   lookup_countSignal(signal, requestPtr, treeNodePtr, 1);
5630 
5631   /**
5632    * Another TreeNode may have been waiting for the completion of this
5633    * request before it could resume its own operation.
5634    */
5635   if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_CONF)
5636   {
5637     jam();
5638     ndbassert(treeNodePtr.p->m_resumePtrI != RNIL);
5639     Ptr<TreeNode> resumeTreeNodePtr;
5640     m_treenode_pool.getPtr(resumeTreeNodePtr, treeNodePtr.p->m_resumePtrI);
5641     lookup_resume(signal, requestPtr, resumeTreeNodePtr);
5642   }
5643 
5644   if (requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no))
5645   {
5646     jam();
5647     // We have received all rows for this treeNode in this batch.
5648     handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
5649   }
5650 }
5651 
5652 void
5653 Dbspj::lookup_parent_row(Signal* signal,
5654                          Ptr<Request> requestPtr,
5655                          Ptr<TreeNode> treeNodePtr,
5656                          const RowPtr & rowRef)
5657 {
5658   jam();
5659 
5660   DEBUG("::lookup_parent_row"
5661      << ", node: " << treeNodePtr.p->m_node_no);
5662   lookup_row(signal, requestPtr, treeNodePtr, rowRef);
5663 } // Dbspj::lookup_parent_row()
5664 
5665 /**
5666  * lookup_resume() is a delayed lookup_parent_row.
5667  * It will locate the next parent row now allowed to execute,
5668  * and create a child lookup request for that row.
5669  */
5670 void
5671 Dbspj::lookup_resume(Signal* signal,
5672                      Ptr<Request> requestPtr,
5673                      Ptr<TreeNode> treeNodePtr)
5674 {
5675   jam();
5676   DEBUG("::lookup_resume"
5677      << ", node: " << treeNodePtr.p->m_node_no
5678   );
5679 
5680   ndbassert(treeNodePtr.p->m_parentPtrI != RNIL);
5681   Ptr<TreeNode> parentPtr;
5682   m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
5683 
5684   if (unlikely(requestPtr.p->m_state & Request::RS_ABORTING))
5685   {
5686     jam();
5687     return;
5688   }
5689   ndbassert(!treeNodePtr.p->m_deferred.isEmpty());
5690   ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));
5691 
5692   Uint32 corrVal;
5693   {
5694     LocalArenaPool<DataBufferSegment<14> > pool(treeNodePtr.p->m_batchArena, m_dependency_map_pool);
5695     Local_correlation_list correlations(pool, treeNodePtr.p->m_deferred.m_correlations);
5696 
5697     Local_correlation_list::DataBufferIterator it;
5698     const bool valid = correlations.position(it, (Uint32)(treeNodePtr.p->m_deferred.m_pos++));
5699     (void)valid; ndbassert(valid);
5700     corrVal = *it.data;
5701   }
5702 
5703   // Set up RowPtr & RowRef for this parent row
5704   RowPtr row;
5705   row.m_src_node_ptrI = parentPtr.i;
5706   row.m_src_correlation = corrVal;
5707 
5708   ndbassert(parentPtr.p->m_rows.m_type == RowCollection::COLLECTION_MAP);
5709   RowRef ref;
5710   parentPtr.p->m_rows.m_map.copyto(ref);
5711   const Uint32* const mapptr = get_row_ptr(ref);
5712 
5713   // Relocate parent row from correlation value.
5714   const Uint32 rowId = (corrVal & 0xFFFF);
5715   parentPtr.p->m_rows.m_map.load(mapptr, rowId, ref);
5716 
5717   const Uint32* const rowptr = get_row_ptr(ref);
5718   setupRowPtr(parentPtr, row, ref, rowptr);
5719 
5720   lookup_row(signal, requestPtr, treeNodePtr, row);
5721 } // Dbspj::lookup_resume()
5722 
5723 void
5724 Dbspj::lookup_row(Signal* signal,
5725                          Ptr<Request> requestPtr,
5726                          Ptr<TreeNode> treeNodePtr,
5727                          const RowPtr & rowRef)
5728 {
5729   jam();
5730 
5731   /**
5732    * Here we need to...
5733    *   1) construct a key
5734    *   2) compute hash     (normally TC)
5735    *   3) get node for row (normally TC)
5736    */
5737   Uint32 err = 0;
5738   const Uint32 tableId = treeNodePtr.p->m_tableOrIndexId;
5739   const Uint32 corrVal = rowRef.m_src_correlation;
5740 
5741   DEBUG("::lookup_row"
5742      << ", node: " << treeNodePtr.p->m_node_no);
5743 
5744   do
5745   {
5746     err = checkTableError(treeNodePtr);
5747     if (unlikely(err != 0))
5748     {
5749       jam();
5750       break;
5751     }
5752 
5753     /**
5754      * Test execution terminated due to 'OutOfQueryMemory' which
5755      * may happen at multiple places below:
5756      * - 17040: Fail on any lookup_parent_row()
5757      * - 17041: Fail on lookup_parent_row() if 'isLeaf'
5758      * - 17042: Fail on lookup_parent_row() if treeNode not root
5759      * - 17043: Fail after last outstanding signal received.
5760      */
5761     if (ERROR_INSERTED(17040) ||
5762        (ERROR_INSERTED(17041) && treeNodePtr.p->isLeaf()) ||
5763        (ERROR_INSERTED(17042) && treeNodePtr.p->m_parentPtrI != RNIL) ||
5764        (ERROR_INSERTED(17043) && requestPtr.p->m_outstanding == 0))
5765     {
5766       jam();
5767       CLEAR_ERROR_INSERT_VALUE;
5768       err = DbspjErr::OutOfQueryMemory;
5769       break;
5770     }
5771 
5772     Uint32 ptrI = RNIL;
5773     if (treeNodePtr.p->m_bits & TreeNode::T_KEYINFO_CONSTRUCTED)
5774     {
5775       jam();
5776       DEBUG("parent_row w/ T_KEYINFO_CONSTRUCTED");
5777       /**
5778        * Get key-pattern
5779        */
5780       LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
5781       Local_pattern_store pattern(pool, treeNodePtr.p->m_keyPattern);
5782 
5783       bool keyIsNull;
5784       err = expand(ptrI, pattern, rowRef, keyIsNull);
5785       if (unlikely(err != 0))
5786       {
5787         jam();
5788         releaseSection(ptrI);
5789         break;
5790       }
5791 
5792       if (keyIsNull)
5793       {
5794         /**
5795          * When the key contains NULL values, an EQ-match is impossible!
5796          * The entire lookup request can therefore be eliminated, as it
5797          * is known to be REFused with errorCode = 626 (Row not found).
5798          *
5799          * Scan requests can simply ignore these child LQHKEYREQs,
5800          * as REFs are not needed, either by the API protocol,
5801          * or in order to handle TN_RESUME_REF.
5802          *
5803          * Lookup requests have to send the same KEYREFs as would have
5804          * been produced by LQH.
5805          */
5806         jam();
5807         DEBUG("Key contains NULL values: Ignore impossible KEYREQ");
5808         releaseSection(ptrI);
5809         ptrI = RNIL;
5810 
5811         /* count(==0) the not sent signal to update completion status */
5812         lookup_countSignal(signal, requestPtr, treeNodePtr, 0);
5813 
5814         /* Send KEYREF(errCode=626) as required by lookup request protocol */
5815         if (requestPtr.p->isLookup())
5816         {
5817           jam();
5818           lookup_stop_branch(signal, requestPtr, treeNodePtr, 626);
5819         }
5820 
5821         /**
5822          * Another TreeNode awaited completion of this treeNode
5823          * or sub-branch before it could resume its operation.
5824          */
5825         if ((treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_REF))
5826         {
5827           jam();
5828           DEBUG("handling TN_RESUME_REF");
5829           ndbassert(treeNodePtr.p->m_resumePtrI != RNIL);
5830           Ptr<TreeNode> resumeTreeNodePtr;
5831           m_treenode_pool.getPtr(resumeTreeNodePtr, treeNodePtr.p->m_resumePtrI);
5832           lookup_resume(signal, requestPtr, resumeTreeNodePtr);
5833         }
5834 
5835         /**
5836          * This possibly completed this treeNode, handle it.
5837          */
5838         if (requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no))
5839         {
5840           jam();
5841           handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
5842         }
5843 
5844         return;  // Bailout, KEYREQ would have returned KEYREF(626) anyway
5845       } // keyIsNull
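      /**
       * Example of the NULL-key elimination above (illustrative): for a
       * child lookup joined as 't2.pk = t1.nullable_col', a t1-row where
       * 'nullable_col IS NULL' can never EQ-match any t2-row. The
       * LQHKEYREQ is therefore never sent; a lookup request produces the
       * TCKEYREF(626) itself, while a scan request just skips the row.
       */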
5846 
5847       ndbassert(ptrI != RNIL);
5848       treeNodePtr.p->m_send.m_keyInfoPtrI = ptrI;
5849     } //T_KEYINFO_CONSTRUCTED
5850 
5851     BuildKeyReq tmp;
5852     err = computeHash(signal, tmp, tableId, treeNodePtr.p->m_send.m_keyInfoPtrI);
5853     if (unlikely(err != 0))
5854       break;
5855 
5856     err = getNodes(signal, tmp, tableId);
5857     if (unlikely(err != 0))
5858       break;
5859 
5860     Uint32 attrInfoPtrI = treeNodePtr.p->m_send.m_attrInfoPtrI;
5861     if (treeNodePtr.p->m_bits & TreeNode::T_ATTRINFO_CONSTRUCTED)
5862     {
5863       jam();
5864       Uint32 tmp = RNIL;
5865 
5866       /**
5867        * Test execution terminated due to 'OutOfSectionMemory' which
5868        * may happen for different treeNodes in the request:
5869        * - 17080: Fail on lookup_parent_row
5870        * - 17081: Fail on lookup_parent_row: if 'isLeaf'
5871        * - 17082: Fail on lookup_parent_row: if treeNode not root
5872        */
5873       if (ERROR_INSERTED(17080) ||
5874          (ERROR_INSERTED(17081) && treeNodePtr.p->isLeaf()) ||
5875          (ERROR_INSERTED(17082) && treeNodePtr.p->m_parentPtrI != RNIL))
5876       {
5877         jam();
5878         CLEAR_ERROR_INSERT_VALUE;
5879         ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
5880                  __LINE__,  __FILE__);
5881         err = DbspjErr::OutOfSectionMemory;
5882         break;
5883       }
5884 
5885       if (!dupSection(tmp, attrInfoPtrI))
5886       {
5887         jam();
5888         ndbassert(tmp == RNIL);  // Guard for memleak
5889         err = DbspjErr::OutOfSectionMemory;
5890         break;
5891       }
5892 
5893       Uint32 org_size;
5894       {
5895         SegmentedSectionPtr ptr;
5896         getSection(ptr, tmp);
5897         org_size = ptr.sz;
5898       }
5899 
5900       bool hasNull;
5901       LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
5902       Local_pattern_store pattern(pool, treeNodePtr.p->m_attrParamPattern);
5903       err = expand(tmp, pattern, rowRef, hasNull);
5904       if (unlikely(err != 0))
5905       {
5906         jam();
5907         releaseSection(tmp);
5908         break;
5909       }
5910 //    ndbrequire(!hasNull);
5911 
5912       /**
5913        * Update the size of the subroutine section, which contains the arguments
5914        */
5915       SegmentedSectionPtr ptr;
5916       getSection(ptr, tmp);
5917       Uint32 new_size = ptr.sz;
5918       Uint32 * sectionptrs = ptr.p->theData;
5919       sectionptrs[4] = new_size - org_size;
5920 
5921       treeNodePtr.p->m_send.m_attrInfoPtrI = tmp;
5922     }
5923 
5924     /**
5925      * Now send...
5926      */
5927 
5928     /**
5929      * TODO merge better with lookup_start (refactor)
5930      */
5931     {
5932       /* We set the upper half word of m_correlation to the tuple ID
5933        * of the parent, such that the API can match this tuple with its
5934        * parent.
5935        * Then we re-use the tuple ID of the parent as the
5936        * tuple ID for this tuple also. Since the tuple ID
5937        * is unique within this batch and SPJ block for the parent operation,
5938        * it must also be unique for this operation.
5939        * This ensures that lookup operations with no user projection will
5940        * work, since such operations will have the same tuple ID as their
5941        * parents. The API will then be able to match a tuple with its
5942        * grandparent, even if it gets no tuple for the parent operation.*/
5943       treeNodePtr.p->m_send.m_correlation =
5944         (corrVal << 16) + (corrVal & 0xffff);
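      /**
       * Worked example (illustrative): a parent row with
       * m_src_correlation == 0x00020007 (parentRowId 2, own rowId 7)
       * yields the child correlation 0x00070007 - both its 'parent' half
       * and its own tuple ID re-use the parent's tuple ID 7.
       */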
5945 
5946       treeNodePtr.p->m_send.m_ref = tmp.receiverRef;
5947       LqhKeyReq * dst = (LqhKeyReq*)treeNodePtr.p->m_lookup_data.m_lqhKeyReq;
5948       dst->hashValue = tmp.hashInfo[0];
5949       dst->fragmentData = tmp.fragId;
5950       Uint32 attrLen = 0;
5951       LqhKeyReq::setDistributionKey(attrLen, tmp.fragDistKey);
5952       dst->attrLen = attrLen;
5953       lookup_send(signal, requestPtr, treeNodePtr);
5954 
5955       if (treeNodePtr.p->m_bits & TreeNode::T_ATTRINFO_CONSTRUCTED)
5956       {
5957         jam();
5958         // restore
5959         treeNodePtr.p->m_send.m_attrInfoPtrI = attrInfoPtrI;
5960       }
5961     }
5962     return;
5963   } while (0);
5964 
5965   // If we fail it will always be a 'hard error' -> abort
5966   ndbrequire(err);
5967   jam();
5968   abort(signal, requestPtr, err);
5969 }
5970 
5971 void
5972 Dbspj::lookup_abort(Signal* signal,
5973                     Ptr<Request> requestPtr,
5974                     Ptr<TreeNode> treeNodePtr)
5975 {
5976   jam();
5977   // Correlation ids for deferred operations are allocated in the
5978   // batch-specific arena. It is sufficient to release the entire arena.
5979   m_arenaAllocator.release(treeNodePtr.p->m_batchArena);
5980   treeNodePtr.p->m_deferred.init();
5981 }
5982 
5983 Uint32
5984 Dbspj::lookup_execNODE_FAILREP(Signal* signal,
5985                                Ptr<Request> requestPtr,
5986                                Ptr<TreeNode> treeNodePtr,
5987                                const NdbNodeBitmask mask)
5988 {
5989   jam();
5990   Uint32 node = 0;
5991   Uint32 sum = 0;
5992   while (requestPtr.p->m_outstanding &&
5993          ((node = mask.find(node + 1)) != NdbNodeBitmask::NotFound))
5994   {
5995     Uint32 cnt = requestPtr.p->m_lookup_node_data[node];
5996     sum += cnt;
5997     requestPtr.p->m_lookup_node_data[node] = 0;
5998   }
5999 
6000   if (sum)
6001   {
6002     jam();
6003     ndbrequire(requestPtr.p->m_outstanding >= sum);
6004     requestPtr.p->m_outstanding -= sum;
6005   }
6006 
6007   return sum;
6008 }
6009 
6010 void
6011 Dbspj::lookup_cleanup(Ptr<Request> requestPtr,
6012                       Ptr<TreeNode> treeNodePtr)
6013 {
6014   cleanup_common(requestPtr, treeNodePtr);
6015 }
6016 
6017 
6018 Uint32
6019 Dbspj::handle_special_hash(Uint32 tableId, Uint32 dstHash[4],
6020                            const Uint64* src,
6021                            Uint32 srcLen,       // Len in #32bit words
6022                            const KeyDescriptor* desc)
6023 {
6024   const Uint32 MAX_KEY_SIZE_IN_LONG_WORDS=
6025     (MAX_KEY_SIZE_IN_WORDS + 1) / 2;
6026   Uint64 alignedWorkspace[MAX_KEY_SIZE_IN_LONG_WORDS * MAX_XFRM_MULTIPLY];
6027   const bool hasVarKeys = desc->noOfVarKeys > 0;
6028   const bool hasCharAttr = desc->hasCharAttr;
6029   const bool compute_distkey = desc->noOfDistrKeys > 0;
6030 
6031   const Uint64 *hashInput = 0;
6032   Uint32 inputLen = 0;
6033   Uint32 keyPartLen[MAX_ATTRIBUTES_IN_INDEX];
6034   Uint32 * keyPartLenPtr;
6035 
6036   /* Normalise KeyInfo into workspace if necessary */
6037   if (hasCharAttr || (compute_distkey && hasVarKeys))
6038   {
6039     hashInput = alignedWorkspace;
6040     keyPartLenPtr = keyPartLen;
6041     inputLen = xfrm_key_hash(tableId,
6042                              (Uint32*)src,
6043                              (Uint32*)alignedWorkspace,
6044                              sizeof(alignedWorkspace) >> 2,
6045                              keyPartLenPtr);
6046     if (unlikely(inputLen == 0))
6047     {
6048       return 290;  // 'Corrupt key in TC, unable to xfrm'
6049     }
6050   }
6051   else
6052   {
6053     /* Keyinfo already suitable for hash */
6054     hashInput = src;
6055     inputLen = srcLen;
6056     keyPartLenPtr = 0;
6057   }
6058 
6059   /* Calculate primary key hash */
6060   md5_hash(dstHash, hashInput, inputLen);
6061 
6062   /* If the distribution key != primary key then we have to
6063    * form a distribution key from the primary key and calculate
6064    * a separate distribution hash based on this
6065    */
6066   if (compute_distkey)
6067   {
6068     jam();
6069 
6070     Uint32 distrKeyHash[4];
6071     /* Reshuffle primary key columns to get just distribution key */
6072     Uint32 len = create_distr_key(tableId, (Uint32*)hashInput, (Uint32*)alignedWorkspace, keyPartLenPtr);
6073     /* Calculate distribution key hash */
6074     md5_hash(distrKeyHash, alignedWorkspace, len);
6075 
6076     /* Just one word used for distribution */
6077     dstHash[1] = distrKeyHash[1];
6078   }
6079   return 0;
6080 }
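/**
 * Example (illustrative): for a table with primary key (a VARCHAR, b INT)
 * and distribution key (b), the code above first xfrm's and md5-hashes the
 * full key into dstHash[], then reshuffles out just 'b' and hashes it
 * separately. Only distrKeyHash[1] is copied back, as a single hash word
 * is all that getNodes()/DIGETNODESREQ consume for the distribution
 * decision.
 */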
6081 
6082 Uint32
6083 Dbspj::computeHash(Signal* signal,
6084                    BuildKeyReq& dst, Uint32 tableId, Uint32 ptrI)
6085 {
6086   /**
6087    * Essentially the same code as in Dbtc::hash().
6088    * The code for user defined partitioning has been removed though.
6089    */
6090   SegmentedSectionPtr ptr;
6091   getSection(ptr, ptrI);
6092 
6093   /* NOTE:  md5_hash below requires 64-bit alignment
6094    */
6095   const Uint32 MAX_KEY_SIZE_IN_LONG_WORDS=
6096     (MAX_KEY_SIZE_IN_WORDS + 1) / 2;
6097   Uint64 tmp64[MAX_KEY_SIZE_IN_LONG_WORDS];
6098   Uint32 *tmp32 = (Uint32*)tmp64;
6099   ndbassert(ptr.sz <= MAX_KEY_SIZE_IN_WORDS);
6100   copy(tmp32, ptr);
6101 
6102   const KeyDescriptor* desc = g_key_descriptor_pool.getPtr(tableId);
6103   ndbrequire(desc != NULL);
6104 
6105   bool need_special_hash = desc->hasCharAttr | (desc->noOfDistrKeys > 0);
6106   if (need_special_hash)
6107   {
6108     jam();
6109     return handle_special_hash(tableId, dst.hashInfo, tmp64, ptr.sz, desc);
6110   }
6111   else
6112   {
6113     jam();
6114     md5_hash(dst.hashInfo, tmp64, ptr.sz);
6115     return 0;
6116   }
6117 }
6118 
6119 /**
6120  * This function differs from computeHash in that *ptrI*
6121  * contains only the (packed) partition key and not the full primary key
6122  */
6123 Uint32
6124 Dbspj::computePartitionHash(Signal* signal,
6125                             BuildKeyReq& dst, Uint32 tableId, Uint32 ptrI)
6126 {
6127   SegmentedSectionPtr ptr;
6128   getSection(ptr, ptrI);
6129 
6130   /* NOTE:  md5_hash below requires 64-bit alignment
6131    */
6132   const Uint32 MAX_KEY_SIZE_IN_LONG_WORDS=
6133     (MAX_KEY_SIZE_IN_WORDS + 1) / 2;
6134   Uint64 _space[MAX_KEY_SIZE_IN_LONG_WORDS];
6135   Uint64 *tmp64 = _space;
6136   Uint32 *tmp32 = (Uint32*)tmp64;
6137   Uint32 sz = ptr.sz;
6138   ndbassert(ptr.sz <= MAX_KEY_SIZE_IN_WORDS);
6139   copy(tmp32, ptr);
6140 
6141   const KeyDescriptor* desc = g_key_descriptor_pool.getPtr(tableId);
6142   ndbrequire(desc != NULL);
6143 
6144   bool need_xfrm = desc->hasCharAttr || desc->noOfVarKeys;
6145   if (need_xfrm)
6146   {
6147     jam();
6148     /**
6149      * xfrm distribution key
6150      */
6151     Uint32 srcPos = 0;
6152     Uint32 dstPos = 0;
6153     Uint32 * src = tmp32;
6154     Uint32 * dst = signal->theData+24;
6155     for (Uint32 i = 0; i < desc->noOfKeyAttr; i++)
6156     {
6157       const KeyDescriptor::KeyAttr& keyAttr = desc->keyAttr[i];
6158       if (AttributeDescriptor::getDKey(keyAttr.attributeDescriptor))
6159       {
6160         Uint32 attrLen =
6161         xfrm_attr_hash(keyAttr.attributeDescriptor, keyAttr.charsetInfo,
6162                        src, srcPos, dst, dstPos,
6163                        NDB_ARRAY_SIZE(signal->theData) - 24);
6164         if (unlikely(attrLen == 0))
6165         {
6166           DEBUG_CRASH();
6167           return 290;  // 'Corrupt key in TC, unable to xfrm'
6168         }
6169       }
6170     }
6171     tmp64 = (Uint64*)dst;
6172     sz = dstPos;
6173   }
6174 
6175   md5_hash(dst.hashInfo, tmp64, sz);
6176   return 0;
6177 }
6178 
6179 /**
6180  * This method comes in with a list of nodes.
6181  * We have already verified that our own node
6182  * isn't in this list. If we have a node in this
6183  * list that is in the same location domain as
6184  * this node, it will be selected before any
6185  * other node. So we will always try to keep
6186  * the read coming from the same location domain.
6187  *
6188  * To avoid radical imbalances we provide a bit
6189  * of round robin on a per-node basis. It isn't
6190  * a perfect round robin. We simply rotate a
6191  * bit among the selected nodes instead of
6192  * always selecting the first one we find.
6193  */
6194 Uint32
6195 Dbspj::check_own_location_domain(const Uint32 *nodes,
6196                                  Uint32 end)
6197 {
6198   Uint32 loc_nodes[MAX_NDB_NODES];
6199   Uint32 loc_node_count = 0;
6200   Uint32 my_location_domain_id =
6201     m_location_domain_id[getOwnNodeId()];
6202 
6203   if (my_location_domain_id == 0)
6204   {
6205     jam();
6206     return 0;
6207   }
6208   for (Uint32 i = 0; i < end; i++)
6209   {
6210     jam();
6211     Uint32 node = nodes[i];
6212     ndbrequire(node != 0 && node < MAX_NDB_NODES);
6213     if (my_location_domain_id ==
6214         m_location_domain_id[node])
6215     {
6216       jam();
6217       loc_nodes[loc_node_count++] = node;
6218     }
6219   }
6220   if (loc_node_count != 0)
6221   {
6222     jam();
6223     /**
6224      * If there are many nodes in the same location domain, we
6225      * spread the load over them by using a very simple load
6226      * balancing routine.
6227      */
6228     m_load_balancer_location++;
6229     Uint32 ret_node = loc_nodes[m_load_balancer_location % loc_node_count];
6230     return ret_node;
6231   }
6232   return 0;
6233 }
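/**
 * Worked example (illustrative): with own node 2 in location domain 1,
 * domain assignments {node3: 1, node4: 2} and candidate replicas
 * nodes[] = {3, 4}, loc_nodes[] becomes {3} and node 3 is returned.
 * With several local candidates, the ever-increasing
 * m_load_balancer_location rotates the choice:
 * '(++counter % loc_node_count)' picks a different local replica on
 * successive calls.
 */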
6234 
6235 Uint32
6236 Dbspj::getNodes(Signal* signal, BuildKeyReq& dst, Uint32 tableId)
6237 {
6238   TableRecordPtr tablePtr;
6239   tablePtr.i = tableId;
6240   ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
6241 
6242   DiGetNodesReq * req = (DiGetNodesReq *)&signal->theData[0];
6243   req->tableId = tableId;
6244   req->hashValue = dst.hashInfo[1];
6245   req->distr_key_indicator = 0; // userDefinedPartitioning not supported!
6246   req->scan_indicator = 0;
6247   req->anyNode = (tablePtr.p->m_flags & TableRecord::TR_FULLY_REPLICATED) != 0;
6248   req->get_next_fragid_indicator = 0;
6249   req->jamBufferPtr = jamBuffer();
6250 
6251   EXECUTE_DIRECT_MT(DBDIH, GSN_DIGETNODESREQ, signal,
6252                     DiGetNodesReq::SignalLength, 0);
6253   jamEntry();
6254 
6255   DiGetNodesConf * conf = (DiGetNodesConf *)&signal->theData[0];
6256   const Uint32 err = signal->theData[0] ? signal->theData[1] : 0;
6257   Uint32 Tdata2 = conf->reqinfo;
6258   Uint32 nodeId = conf->nodes[0];
6259   Uint32 instanceKey = (Tdata2 >> 24) & 127;
6260 
6261   DEBUG("HASH to nodeId:" << nodeId << ", instanceKey:" << instanceKey);
6262 
6263   jamEntry();
6264   if (unlikely(err != 0))
6265   {
6266     jam();
6267     goto error;
6268   }
6269 
6270   /**
6271    * SPJ only does committed-read (for now),
6272    *   so it is always ok to READ_BACKUP
6273    *   when applicable
6274    *
6275    */
6276   if (nodeId != getOwnNodeId() &&
6277       tablePtr.p->m_flags & TableRecord::TR_READ_BACKUP)
6278   {
6279     /* Node cnt from DIH ignores primary, presumably to fit in 2 bits */
6280     Uint32 cnt = (Tdata2 & 3) + 1;
6281     for (Uint32 i = 1; i < cnt; i++)
6282     {
6283       jam();
6284       if (conf->nodes[i] == getOwnNodeId())
6285       {
6286         jam();
6287         nodeId = getOwnNodeId();
6288         break;
6289       }
6290     }
6291     if (nodeId != getOwnNodeId())
6292     {
6293       Uint32 node;
6294       jam();
6295       if ((node = check_own_location_domain(&conf->nodes[0],
6296                                             cnt)) != 0)
6297       {
6298         nodeId = node;
6299       }
6300     }
6301   }
6302 
6303   dst.fragId = conf->fragId;
6304   dst.fragDistKey = (Tdata2 >> 16) & 255;
6305   dst.receiverRef = numberToRef(DBLQH, instanceKey, nodeId);
6306 
6307   return 0;
6308 
6309 error:
6310   return err;
6311 }
6312 
6313 bool
6314 Dbspj::lookup_checkNode(const Ptr<Request> requestPtr,
6315                         const Ptr<TreeNode> treeNodePtr)
6316 {
6317   jam();
6318 
6319   /* TODO */
6320 
6321   return true;
6322 }
6323 
6324 void
6325 Dbspj::lookup_dumpNode(const Ptr<Request> requestPtr,
6326                        const Ptr<TreeNode> treeNodePtr)
6327 {
6328   jam();
6329 
6330   const LookupData& data = treeNodePtr.p->m_lookup_data;
6331 
6332   g_eventLogger->info("DBSPJ %u :       LOOKUP api_resultRef 0x%x "
6333                       "resultData %u outstanding %u",
6334                       instance(),
6335                       data.m_api_resultRef,
6336                       data.m_api_resultData,
6337                       data.m_outstanding);
6338 
6339   /* TODO : Dump LQHKEYREQ */
6340 }
6341 
6342 /**
6343  * END - MODULE LOOKUP
6344  */
6345 
/**
 * MODULE SCAN FRAGMENT
 *
 * NOTE: This may not be the root node
 */
const Dbspj::OpInfo
Dbspj::g_ScanFragOpInfo =
{
  &Dbspj::scanFrag_build,
  &Dbspj::scanFrag_prepare,
  &Dbspj::scanFrag_start,
  &Dbspj::scanFrag_countSignal,
  0, // execLQHKEYREF
  0, // execLQHKEYCONF
  &Dbspj::scanFrag_execSCAN_FRAGREF,
  &Dbspj::scanFrag_execSCAN_FRAGCONF,
  &Dbspj::scanFrag_parent_row,
  &Dbspj::scanFrag_parent_batch_complete,
  &Dbspj::scanFrag_parent_batch_repeat,
  &Dbspj::scanFrag_parent_batch_cleanup,
  &Dbspj::scanFrag_execSCAN_NEXTREQ,
  &Dbspj::scanFrag_complete,
  &Dbspj::scanFrag_abort,
  &Dbspj::scanFrag_execNODE_FAILREP,
  &Dbspj::scanFrag_cleanup,
  &Dbspj::scanFrag_checkNode,
  &Dbspj::scanFrag_dumpNode
};

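/**
 * For illustration: OpInfo is the per-operation dispatch table through
 * which the generic TreeNode machinery reaches the handlers above
 * (execDIH_SCAN_TAB_CONF below verifies treeNodePtr.p->m_info ==
 * &g_ScanFragOpInfo). A sketch of such a dispatch, with the member
 * name m_parent_row assumed here purely for illustration:
 *
 *   (this->*(treeNodePtr.p->m_info->m_parent_row))
 *       (signal, requestPtr, treeNodePtr, rowRef);
 */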
Uint32
Dbspj::scanFrag_build(Build_context& ctx,
                      Ptr<Request> requestPtr,
                      const QueryNode* qn,
                      const QueryNodeParameters* qp)
{
  Uint32 err = 0;
  Ptr<TreeNode> treeNodePtr;
  const QN_ScanFragNode * node = (const QN_ScanFragNode*)qn;
  const QN_ScanFragParameters * param = (const QN_ScanFragParameters*)qp;

  // Only scan requests can have scan-TreeNodes
  ndbassert(requestPtr.p->isScan());

  do
  {
    jam();
    err = DbspjErr::InvalidTreeNodeSpecification;
    DEBUG("scanFrag_build: len=" << node->len);
    if (unlikely(node->len < QN_ScanFragNode::NodeSize))
    {
      jam();
      break;
    }

    err = DbspjErr::InvalidTreeParametersSpecification;
    DEBUG("param len: " << param->len);
    if (unlikely(param->len < QN_ScanFragParameters::NodeSize))
    {
      jam();
      break;
    }

    err = createNode(ctx, requestPtr, treeNodePtr);
    if (unlikely(err != 0))
    {
      jam();
      break;
    }

    const Uint32 treeBits = node->requestInfo;
    const Uint32 paramBits = param->requestInfo;
    const Uint32 batchRows = param->batch_size_rows;
    const Uint32 batchBytes = param->batch_size_bytes;
    const Uint32 indexId = node->tableId;
    const Uint32 tableId = g_key_descriptor_pool.getPtr(indexId)->primaryTableId;

    treeNodePtr.p->m_info = &g_ScanFragOpInfo;
    treeNodePtr.p->m_tableOrIndexId = indexId;
    treeNodePtr.p->m_primaryTableId = tableId;
    treeNodePtr.p->m_schemaVersion = node->tableVersion;
    treeNodePtr.p->m_bits |= TreeNode::T_ATTR_INTERPRETED;
    treeNodePtr.p->m_batch_size = batchRows;

    ctx.m_resultData = param->resultData;

    /**
     * Parse stuff
     */
    struct DABuffer nodeDA, paramDA;
    nodeDA.ptr = node->optional;
    nodeDA.end = nodeDA.ptr + (node->len - QN_ScanFragNode::NodeSize);
    paramDA.ptr = param->optional;
    paramDA.end = paramDA.ptr + (param->len - QN_ScanFragParameters::NodeSize);

    err = parseScanFrag(ctx, requestPtr, treeNodePtr,
                        nodeDA, treeBits, paramDA, paramBits);

    if (unlikely(err != 0))
    {
      jam();
      break;
    }

    /**
     * If there exist other scan TreeNodes not being among
     * my ancestors, results from this scanFrag may be repeated
     * as part of an X-scan.
     *
     * NOTE: The scan nodes being along the left deep ancestor chain
     *       are not 'repeatable' as they are driving the
     *       repeated X-scan and are thus not repeated themselves.
     */
    if (requestPtr.p->m_bits & Request::RT_REPEAT_SCAN_RESULT &&
        !treeNodePtr.p->m_ancestors.contains(ctx.m_scans))
    {
      treeNodePtr.p->m_bits |= TreeNode::T_SCAN_REPEATABLE;
    }

    ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
    ScanFragReq* const dst = reinterpret_cast<ScanFragReq*>(data.m_scanFragReq);

    /**
     * The root node gets most of its ScanFragReq contents readily
     * filled in from the 'start_signal'. So building the initial
     * contents of the m_scanFragReq has to be handled differently
     * for the root node vs. a non-root node.
     */
    if (ctx.m_start_signal)  //Is the root node?
    {
      jam();
      ndbassert(treeNodePtr.p->m_parentPtrI == RNIL);

      /**
       * The REQuest in 'start_signal' contains most of the m_scanFragReq
       * readily filled in. Copy it, and modify where needed.
       */
      const Signal* signal = ctx.m_start_signal;
      const ScanFragReq* const req = reinterpret_cast<const ScanFragReq*>(signal->getDataPtr());
      memcpy(dst, req, sizeof(data.m_scanFragReq));

      // Assert some limitations on the SPJ supported ScanFragReq
      ndbassert(ScanFragReq::getLockMode(req->requestInfo) == 0);
      ndbassert(ScanFragReq::getHoldLockFlag(req->requestInfo) == 0);
      ndbassert(ScanFragReq::getKeyinfoFlag(req->requestInfo) == 0);
      ndbassert(ScanFragReq::getReadCommittedFlag(req->requestInfo) == 1);
      ndbassert(ScanFragReq::getLcpScanFlag(req->requestInfo) == 0);
      ndbassert(ScanFragReq::getReorgFlag(req->requestInfo) == ScanFragReq::REORG_ALL);

      /**
       * 'NoDiskFlag' should agree with information in treeNode
       */
      ndbassert(ScanFragReq::getNoDiskFlag(req->requestInfo) ==
                ((treeBits & DABits::NI_LINKED_DISK) == 0 &&
                 (paramBits & DABits::PI_DISK_ATTR) == 0));

      ndbassert(dst->savePointId == ctx.m_savepointId);
      ndbassert(dst->tableId == node->tableId);
      ndbassert(dst->schemaVersion == node->tableVersion);
      ndbassert(dst->transId1 == requestPtr.p->m_transId[0]);
      ndbassert(dst->transId2 == requestPtr.p->m_transId[1]);

      treeNodePtr.p->m_bits |= TreeNode::T_ONE_SHOT;

      TableRecordPtr tablePtr;
      tablePtr.i = treeNodePtr.p->m_tableOrIndexId;
      ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
      const bool readBackup =
        (tablePtr.p->m_flags & TableRecord::TR_READ_BACKUP) != 0;

      data.m_fragCount = 0;

      /**
       * As this is the root node, fragId is already contained in the REQuest.
       * Fill in the set of 'm_fragments' to be SCAN'ed by this REQ.
       */
      {
        Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);

        if (ScanFragReq::getMultiFragFlag(req->requestInfo))
        {
          jam();
          Uint32 variableLen = 25;
          data.m_fragCount = signal->theData[variableLen++];
          for (Uint32 i=0; i < data.m_fragCount; i++)
          {
            jam();
            Ptr<ScanFragHandle> fragPtr;
            const Uint32 fragId  = signal->theData[variableLen++];
            const Uint32 ref = numberToRef(DBLQH,
                                           getInstanceKey(req->tableId, fragId),
                                           getOwnNodeId());

            DEBUG("Scan build, fragId: " << fragId << ", ref: " << ref);

            if (!ERROR_INSERTED_CLEAR(17004) &&
                likely(m_scanfraghandle_pool.seize(requestPtr.p->m_arena, fragPtr)))
            {
              fragPtr.p->init(fragId, readBackup);
              fragPtr.p->m_treeNodePtrI = treeNodePtr.i;
              fragPtr.p->m_ref = ref;
              list.addLast(fragPtr);
            }
            else
            {
              jam();
              err = DbspjErr::OutOfQueryMemory;
              return err;
            }
          }
        }
        else // 'not getMultiFragFlag(req->requestInfo)'
        {
          jam();
          Ptr<ScanFragHandle> fragPtr;
          data.m_fragCount = 1;

          const Uint32 ref =
            numberToRef(DBLQH,
                        getInstanceKey(req->tableId, req->fragmentNoKeyLen),
                        getOwnNodeId());

          if (!ERROR_INSERTED_CLEAR(17004) &&
              likely(m_scanfraghandle_pool.seize(requestPtr.p->m_arena, fragPtr)))
          {
            jam();
            fragPtr.p->init(req->fragmentNoKeyLen, readBackup);
            fragPtr.p->m_treeNodePtrI = treeNodePtr.i;
            fragPtr.p->m_ref = ref;
            list.addLast(fragPtr);
          }
          else
          {
            jam();
            err = DbspjErr::OutOfQueryMemory;
            return err;
          }
        }
        requestPtr.p->m_rootFragCnt = data.m_fragCount;
      }

      if (ScanFragReq::getRangeScanFlag(req->requestInfo))
      {
        c_Counters.incr_counter(CI_RANGE_SCANS_RECEIVED, 1);
      }
      else
      {
        c_Counters.incr_counter(CI_TABLE_SCANS_RECEIVED, 1);
      }
    }
    else
    {
      requestPtr.p->m_bits |= Request::RT_NEED_PREPARE;
      requestPtr.p->m_bits |= Request::RT_NEED_COMPLETE;

      treeNodePtr.p->m_bits |= TreeNode::T_NEED_PREPARE;
      treeNodePtr.p->m_bits |= TreeNode::T_NEED_COMPLETE;
      treeNodePtr.p->m_bits |= TreeNode::T_NEED_REPORT_BATCH_COMPLETED;

      dst->tableId = node->tableId;
      dst->schemaVersion = node->tableVersion;
      dst->fragmentNoKeyLen = 0xff; //Filled in after 'prepare'
      dst->savePointId = ctx.m_savepointId;
      dst->transId1 = requestPtr.p->m_transId[0];
      dst->transId2 = requestPtr.p->m_transId[1];

      Uint32 requestInfo = 0;
      ScanFragReq::setReadCommittedFlag(requestInfo, 1);
      ScanFragReq::setScanPrio(requestInfo, ctx.m_scanPrio);
      ScanFragReq::setRangeScanFlag(requestInfo, 1);
      ScanFragReq::setNoDiskFlag(requestInfo,
                                 (treeBits & DABits::NI_LINKED_DISK) == 0 &&
                                 (paramBits & DABits::PI_DISK_ATTR) == 0);

      if (treeBits & DABits::NI_FIRST_MATCH && treeNodePtr.p->isLeaf())
      {
        // Can only push firstMatch elimination to the data nodes if the result
        // does not depend on finding matches from children -> has to be a leaf
        ScanFragReq::setFirstMatchFlag(requestInfo, 1);
      }
      if (treeBits & DABits::NI_ANTI_JOIN && treeNodePtr.p->isLeaf())
      {
        // ANTI_JOIN cares about whether a match was found or not.
        // Thus, returning only the first match is sufficient here as well.
        ScanFragReq::setFirstMatchFlag(requestInfo, 1);
      }
      dst->requestInfo = requestInfo;
    }

    // Common part whether root or not
    dst->senderData = treeNodePtr.i;
    dst->resultRef  = reference();
    dst->resultData = treeNodePtr.i;
    ScanFragReq::setCorrFactorFlag(dst->requestInfo, 1);
    ScanFragReq::setMultiFragFlag(dst->requestInfo, 0);

    dst->batch_size_rows  = batchRows;
    dst->batch_size_bytes = batchBytes;

    ctx.m_scan_cnt++;
    ctx.m_scans.set(treeNodePtr.p->m_node_no);

    return 0;
  } while (0);

  return err;
}

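/**
 * For illustration: the T_SCAN_REPEATABLE test in scanFrag_build is a
 * bitmask containment check over TreeNode numbers. A small worked
 * example, with node numbers as bit positions:
 *
 *   ctx.m_scans                = {1,3}  // scan TreeNodes built so far
 *   treeNodePtr.p->m_ancestors = {0,1}  // this node's ancestor chain
 *
 * m_ancestors.contains(m_scans) is false since node 3 is a scan but not
 * an ancestor, so with RT_REPEAT_SCAN_RESULT set this node is marked
 * repeatable: node 3 may drive an X-scan that repeats our rows.
 */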
Uint32
Dbspj::parseScanFrag(Build_context& ctx,
                     Ptr<Request> requestPtr,
                     Ptr<TreeNode> treeNodePtr,
                     DABuffer tree, Uint32 treeBits,
                     DABuffer param, Uint32 paramBits)
{
  Uint32 err = 0;

  typedef QN_ScanFragNode Node;
  typedef QN_ScanFragParameters Params;

  do
  {
    jam();

    ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
    data.m_fragments.init();
    data.m_fragCount = 0;
    data.m_frags_outstanding = 0;
    data.m_frags_complete = 0;
    data.m_frags_not_started = 0;
    data.m_parallelismStat.init();
    data.m_batch_chunks = 0;

    /**
     * We will need to look at the parameters again if the scan is pruned
     * and the prune key uses parameter values. Therefore, we keep a
     * reference to the start of the parameter buffer.
     */
    DABuffer origParam = param;
    err = parseDA(ctx, requestPtr, treeNodePtr,
                  tree, treeBits, param, paramBits);
    if (unlikely(err != 0))
      break;

    if (treeBits & Node::SF_PRUNE_PATTERN)
    {
      Uint32 len_cnt = * tree.ptr ++;
      Uint32 len = len_cnt & 0xFFFF; // length of pattern in words
      Uint32 cnt = len_cnt >> 16;    // no of parameters

      LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
      ndbrequire((cnt==0) == ((treeBits & Node::SF_PRUNE_PARAMS) ==0));
      ndbrequire((cnt==0) == ((paramBits & Params::SFP_PRUNE_PARAMS)==0));

      if (treeBits & Node::SF_PRUNE_LINKED)
      {
        jam();
        DEBUG("LINKED-PRUNE PATTERN w/ " << cnt << " PARAM values");

        data.m_prunePattern.init();
        Local_pattern_store pattern(pool, data.m_prunePattern);

        /**
         * Expand pattern into a new pattern (with linked values)
         */
        err = expand(pattern, treeNodePtr, tree, len, origParam, cnt);
        if (unlikely(err != 0))
        {
          jam();
          break;
        }
        treeNodePtr.p->m_bits |= TreeNode::T_PRUNE_PATTERN;
        c_Counters.incr_counter(CI_PRUNED_RANGE_SCANS_RECEIVED, 1);
      }
      else
      {
        jam();
        DEBUG("FIXED-PRUNE w/ " << cnt << " PARAM values");

        /**
         * Expand pattern directly into a fixed key.
         *   This means a "fixed" pruning from here on,
         *   i.e. a guaranteed single partition.
         */
        Uint32 prunePtrI = RNIL;
        bool hasNull;
        err = expand(prunePtrI, tree, len, origParam, cnt, hasNull);
        if (unlikely(err != 0))
        {
          jam();
          releaseSection(prunePtrI);
          break;
        }

        if (unlikely(hasNull))
        {
          /* API should have eliminated requests w/ const-NULL keys */
          jam();
          DEBUG("BEWARE: T_CONST_PRUNE-key contains NULL values");
          releaseSection(prunePtrI);
//        treeNodePtr.p->m_bits |= TreeNode::T_NULL_PRUNE;
//        break;
          ndbabort();
        }
        ndbrequire(prunePtrI != RNIL);  /* todo: can we allow / take advantage of NULLs in range scan? */
        data.m_constPrunePtrI = prunePtrI;

        /**
         * We may not compute the partition for the hash-key here
         *   as we have not yet opened a read-view
         */
        treeNodePtr.p->m_bits |= TreeNode::T_CONST_PRUNE;
        c_Counters.incr_counter(CI_CONST_PRUNED_RANGE_SCANS_RECEIVED, 1);
      }
    } //SF_PRUNE_PATTERN

    if ((treeNodePtr.p->m_bits & TreeNode::T_CONST_PRUNE) == 0 &&
        ((treeBits & Node::SF_PARALLEL) ||
         (paramBits & Params::SFP_PARALLEL)))
    {
      jam();
      treeNodePtr.p->m_bits |= TreeNode::T_SCAN_PARALLEL;
    }

    if (paramBits & Params::SFP_SORTED_ORDER)
    {
      jam();
      treeNodePtr.p->m_bits |= TreeNode::T_SORTED_ORDER;
    }

    return 0;
  } while(0);

  jam();
  return err;
}

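/**
 * For illustration: the prune pattern header word read above packs two
 * 16-bit fields:
 *
 *   Uint32 len_cnt = *tree.ptr++;
 *   Uint32 len = len_cnt & 0xFFFF;  // pattern length, in words
 *   Uint32 cnt = len_cnt >> 16;     // number of parameters
 *
 * E.g. len_cnt = 0x00020005 describes a 5-word pattern taking 2
 * parameter values from the (saved) origParam buffer.
 */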
void
Dbspj::scanFrag_prepare(Signal * signal,
                        Ptr<Request> requestPtr, Ptr<TreeNode> treeNodePtr)
{
  jam();

  if (!ERROR_INSERTED(17521)) // Avoid emulated rnd errors
  {
    // ::checkTableError() should be handled before we reach this far
    ndbassert(checkTableError(treeNodePtr) == 0); //Handled in Dbspj::start
  }
  ndbassert(treeNodePtr.p->m_state == TreeNode::TN_BUILDING);
  treeNodePtr.p->m_state = TreeNode::TN_PREPARING;

  requestPtr.p->m_outstanding++;

  DihScanTabReq * req = (DihScanTabReq*)signal->getDataPtrSend();
  req->tableId = treeNodePtr.p->m_tableOrIndexId;
  req->schemaTransId = 0;
  req->jamBufferPtr = jamBuffer();

  EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_REQ, signal,
                    DihScanTabReq::SignalLength, 0);

  DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();
  Uint32 senderData = conf->senderData;
  conf->senderData = treeNodePtr.i;
  /**
   * We need to introduce a real-time break here for 2 reasons. The first
   * is that it is required by the real-time break rules. We can start an
   * arbitrary number of prepare scans here, so it is necessary to do a
   * real-time break to ensure that we don't execute for too long
   * without real-time breaks.
   *
   * The second reason is that the caller is looping over the list
   * of tree nodes, and we can't change this list while it is being
   * traversed. So we introduce a real-time break to ensure that
   * the caller only starts up prepare messages and doesn't actually
   * perform all of them.
   */
  if (senderData == 0)
  {
    sendSignal(reference(),
               GSN_DIH_SCAN_TAB_CONF,
               signal,
               DihScanTabConf::SignalLength,
               JBB);
    return;
  }
  else
  {
    sendSignal(reference(),
               GSN_DIH_SCAN_TAB_REF,
               signal,
               DihScanTabRef::SignalLength,
               JBB);
    return;
  }
}

void
Dbspj::execDIH_SCAN_TAB_REF(Signal* signal)
{
  jamEntry();
  DihScanTabRef * ref = (DihScanTabRef*)signal->getDataPtr();

  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, ref->senderData);
  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);

  ndbrequire(requestPtr.p->isScan());
  ndbrequire(requestPtr.p->m_outstanding >= 1);
  requestPtr.p->m_outstanding -= 1;
  Uint32 errCode = ref->error;
  abort(signal, requestPtr, errCode);
}

void
Dbspj::execDIH_SCAN_TAB_CONF(Signal* signal)
{
  jamEntry();
  DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();

  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, conf->senderData);

  ndbrequire(treeNodePtr.p->m_info == &g_ScanFragOpInfo);

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;

  Uint32 cookie = conf->scanCookie;
  Uint32 fragCount = conf->fragmentCount;

  if (conf->reorgFlag)
  {
    jam();
    ScanFragReq * dst = (ScanFragReq*)data.m_scanFragReq;
    ScanFragReq::setReorgFlag(dst->requestInfo, ScanFragReq::REORG_NOT_MOVED);
  }
  if (treeNodePtr.p->m_bits & TreeNode::T_CONST_PRUNE)
  {
    jam();
    fragCount = 1;
  }
  data.m_fragCount = fragCount;
  data.m_scanCookie = cookie;

  const Uint32 prunemask = TreeNode::T_PRUNE_PATTERN | TreeNode::T_CONST_PRUNE;
  bool pruned = (treeNodePtr.p->m_bits & prunemask) != 0;

  TableRecordPtr tablePtr;
  tablePtr.i = treeNodePtr.p->m_tableOrIndexId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
  const bool readBackup =
    (tablePtr.p->m_flags & TableRecord::TR_READ_BACKUP) != 0;

  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
  ndbassert(requestPtr.p->m_outstanding > 0);
  requestPtr.p->m_outstanding--;

  // Add a skew in the fragment lists such that we don't scan
  // the same subset of frags from all SPJ requests in case of
  // the scan not being 'T_SCAN_PARALLEL'
  Uint16 fragNoOffs = (getOwnNodeId()*requestPtr.p->m_rootFragCnt) % fragCount;
  Uint32 err = 0;

  do
  {
    Ptr<ScanFragHandle> fragPtr;

    /** Allocate & init all 'fragCount' fragment descriptors */
    {
      Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);

      err = checkTableError(treeNodePtr);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }
      for (Uint32 i = 0; i<fragCount; i++)
      {
        Ptr<ScanFragHandle> fragPtr;
        Uint16 fragNo = (fragNoOffs+i) % fragCount;

        if (!ERROR_INSERTED_CLEAR(17012) &&
            likely(m_scanfraghandle_pool.seize(requestPtr.p->m_arena, fragPtr)))
        {
          jam();
          fragPtr.p->init(fragNo, readBackup);
          fragPtr.p->m_treeNodePtrI = treeNodePtr.i;
          list.addLast(fragPtr);
        }
        else
        {
          jam();
          err = DbspjErr::OutOfQueryMemory;
          goto error;
        }
      }
      list.first(fragPtr); // Needed if T_CONST_PRUNE
    } // end 'Alloc scope'

    if (treeNodePtr.p->m_bits & TreeNode::T_CONST_PRUNE)
    {
      jam();

      // TODO we need a different variant of computeHash here,
      // since m_constPrunePtrI does not contain the full primary key
      // but only the parts in the distribution key

      BuildKeyReq tmp;
      Uint32 tableId = treeNodePtr.p->m_primaryTableId;
      err = computePartitionHash(signal, tmp, tableId, data.m_constPrunePtrI);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }

      releaseSection(data.m_constPrunePtrI);
      data.m_constPrunePtrI = RNIL;

      err = getNodes(signal, tmp, tableId);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }

      fragPtr.p->m_fragId = tmp.fragId;
      fragPtr.p->m_ref = tmp.receiverRef;
      ndbassert(data.m_fragCount == 1);
    }
    else if (fragCount == 1)
    {
      jam();
      /**
       * This is roughly equivalent to T_CONST_PRUNE,
       *   so pretend that it is const-pruned
       */
      if (treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN)
      {
        jam();
        LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
        Local_pattern_store pattern(pool, data.m_prunePattern);
        pattern.release();
      }
      data.m_constPrunePtrI = RNIL;
      Uint32 clear = TreeNode::T_PRUNE_PATTERN | TreeNode::T_SCAN_PARALLEL;
      treeNodePtr.p->m_bits &= ~clear;
      treeNodePtr.p->m_bits |= TreeNode::T_CONST_PRUNE;

      /**
       * We must get fragPtr.p->m_ref...so set pruned=false
       */
      pruned = false;
    }
    data.m_frags_complete = data.m_fragCount;

    if (!pruned)
    {
      /** Start requesting node info from DIH */
      jam();
      ndbassert(data.m_frags_outstanding == 0);
      data.m_frags_outstanding = data.m_fragCount;
      requestPtr.p->m_outstanding++;

      err = scanFrag_sendDihGetNodesReq(signal, requestPtr, treeNodePtr);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }
    }
    else
    {
      jam();
      treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
    }

    ndbassert(err == 0);
    checkPrepareComplete(signal, requestPtr);
    return;
  } while (0);

error:
  jam();
  ndbassert(err != 0);
  abort(signal, requestPtr, err);
  checkBatchComplete(signal, requestPtr);
}

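/**
 * For illustration: the fragNoOffs skew above simply rotates the
 * fragment list per SPJ instance. A worked example, assuming
 * ownNodeId = 2, m_rootFragCnt = 2 and fragCount = 8:
 *
 *   fragNoOffs = (2 * 2) % 8 = 4
 *   fragNo(i)  = (4 + i) % 8   ->   4,5,6,7,0,1,2,3
 *
 * so non-parallel scans started from different nodes begin on
 * different fragments instead of all starting at fragment 0.
 */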
/**
 * Will check the fragment list for fragments which need to
 * get node info from DIH to construct 'fragPtr.p->m_ref'.
 *
 * In order to avoid CPU starvation, or an unmanageably huge FragItem[],
 * at most MAX_DIGETNODESREQS fragments are requested in a single signal.
 * If there are more fragments, we have to repeatedly call this
 * function when the CONF for the first fragment set is received.
 */
Uint32
Dbspj::scanFrag_sendDihGetNodesReq(Signal* signal,
                                   Ptr<Request> requestPtr,
                                   Ptr<TreeNode> treeNodePtr)
{
  jam();
  Uint32 err = 0;
  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  Uint32 tableId = treeNodePtr.p->m_tableOrIndexId;
  TableRecordPtr tablePtr;
  Ptr<ScanFragHandle> fragPtr;
  Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
  Uint32 readAny = tablePtr.p->m_flags & TableRecord::TR_FULLY_REPLICATED ?
                   1 : 0;

  ndbassert(data.m_frags_outstanding > 0);

  Uint32 fragCnt = 0;
  for (list.first(fragPtr);
       !fragPtr.isNull();
       list.next(fragPtr))
  {
    jam();
    ndbassert(requestPtr.p->m_outstanding > 0);
    ndbassert(data.m_frags_outstanding > 0);

    if (fragCnt >= DiGetNodesReq::MAX_DIGETNODESREQS ||
        (ERROR_INSERTED(17131) && fragCnt >= 1))
    {
      jam();
      signal->theData[0] = 3;
      signal->theData[1] = treeNodePtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      break; //to exit
    }

    if (fragPtr.p->m_ref == 0) // Need GSN_DIGETNODESREQ
    {
      jam();
      DiGetNodesReq * const req = (DiGetNodesReq *)&signal->theData[0];

      req->tableId = treeNodePtr.p->m_tableOrIndexId;
      req->hashValue = fragPtr.p->m_fragId;
      req->distr_key_indicator = ZTRUE;
      req->scan_indicator = ZTRUE;
      req->anyNode = readAny;
      req->get_next_fragid_indicator = 0;
      req->jamBufferPtr = jamBuffer();

      EXECUTE_DIRECT_MT(DBDIH, GSN_DIGETNODESREQ, signal,
                        DiGetNodesReq::SignalLength, 0);

      const Uint32 errCode = signal->theData[0];

      if (ERROR_INSERTED_CLEAR(17130) && requestPtr.p->m_outstanding == 1)
      {
        jamEntry();
        data.m_frags_outstanding = 0;
        err= DbspjErr::OutOfSectionMemory;
        break;
      }
      else if (unlikely(errCode))
      {
        jamEntry();
        data.m_frags_outstanding = 0;
        err= errCode;
        break;
      }

      const DiGetNodesConf * conf = (DiGetNodesConf *)&signal->theData[0];
      //if (!errCode)
      {
        /**
         * Get the instance key from the upper bits, except the most
         * significant bit, which is used as the reorg moving flag.
         */
        jamEntry();
        /* Node cnt from DIH ignores primary, presumably to fit in 2 bits */
        Uint32 cnt = (conf->reqinfo & 3) + 1;
        Uint32 instanceKey = (conf->reqinfo >> 24) & 127;
        NodeId nodeId = conf->nodes[0];
        if (nodeId != getOwnNodeId() &&
            fragPtr.p->m_readBackup)
        {
          for (Uint32 i = 1; i < cnt; i++)
          {
            jam();
            if (conf->nodes[i] == getOwnNodeId())
            {
              jam();
              nodeId = getOwnNodeId();
              break;
            }
          }
          if (nodeId != getOwnNodeId())
          {
            Uint32 node;
            jam();
            if ((node = check_own_location_domain(&conf->nodes[0],
                                                  cnt)) != 0)
            {
              nodeId = node;
            }
          }
        }
        fragPtr.p->m_ref = numberToRef(DBLQH, instanceKey, nodeId);
        /**
         * For fully replicated tables we can change the fragment id to a local
         * fragment as part of DIGETNODESREQ. So set it again here.
         */
        fragPtr.p->m_fragId = conf->fragId;
      }

      fragCnt++;
      ndbassert(data.m_frags_outstanding > 0);
      ndbassert(treeNodePtr.p->m_state != TreeNode::TN_INACTIVE);
      data.m_frags_outstanding--;
    }
  }
  jam();

  if (data.m_frags_outstanding == 0)
  {
    jam();
    treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
    requestPtr.p->m_outstanding--;
  }
  return err;
} //Dbspj::scanFrag_sendDihGetNodesReq


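/**
 * For illustration: the loop above resolves at most
 * DiGetNodesReq::MAX_DIGETNODESREQS fragments per invocation, then
 * re-schedules itself by sending CONTINUEB (case tag 3, resume point
 * treeNodePtr.i) back to this block. Fragments already resolved are
 * recognised by fragPtr.p->m_ref != 0 and skipped on the next pass,
 * so the work per signal stays bounded even for large fragment lists.
 */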
void
Dbspj::scanFrag_start(Signal* signal,
                      Ptr<Request> requestPtr,
                      Ptr<TreeNode> treeNodePtr)
{
  jam();
  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;

  ndbassert(data.m_fragCount > 0);
  ndbassert(data.m_frags_outstanding == 0);
  ndbassert(data.m_frags_complete == 0);
  data.m_frags_not_started = data.m_fragCount;

  ndbassert(treeNodePtr.p->m_state == TreeNode::TN_BUILDING);
  treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;

  scanFrag_send(signal, requestPtr, treeNodePtr);
}//Dbspj::scanFrag_start

Uint32
Dbspj::scanFrag_findFrag(Local_ScanFragHandle_list & list,
                         Ptr<ScanFragHandle> & fragPtr, Uint32 fragId)
{
  for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
  {
    jam();
    if (fragPtr.p->m_fragId == fragId)
    {
      jam();
      return 0;
    }
  }

  return DbspjErr::IndexFragNotFound;
}

void
Dbspj::scanFrag_parent_row(Signal* signal,
                           Ptr<Request> requestPtr,
                           Ptr<TreeNode> treeNodePtr,
                           const RowPtr & rowRef)
{
  jam();
  ndbassert(treeNodePtr.p->m_parentPtrI != RNIL);
  DEBUG("::scanFrag_parent_row"
     << ", node: " << treeNodePtr.p->m_node_no);

  Uint32 err;
  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;

  /**
   * Construct the range definition,
   *   and if the prune pattern is enabled,
   *   stuff it onto the correct scanFrag
   */
  do
  {
    Ptr<ScanFragHandle> fragPtr;
    Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
    LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);

    err = checkTableError(treeNodePtr);
    if (unlikely(err != 0))
    {
      jam();
      break;
    }

    if (treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN)
    {
      jam();

      /**
       * TODO: Expand into linear memory instead
       *       of expanding into sections, and then copying the
       *       section into linear
       */
      Local_pattern_store pattern(pool, data.m_prunePattern);
      Uint32 pruneKeyPtrI = RNIL;
      bool hasNull;
      err = expand(pruneKeyPtrI, pattern, rowRef, hasNull);
      if (unlikely(err != 0))
      {
        jam();
        releaseSection(pruneKeyPtrI);
        break;
      }

      if (unlikely(hasNull))
      {
        jam();
        DEBUG("T_PRUNE_PATTERN-key contains NULL values");

        // Ignore this request as 'NULL == <column>' will never give a match
        releaseSection(pruneKeyPtrI);
        return;  // Bailout, SCANREQ would have returned 0 rows anyway
      }

      BuildKeyReq tmp;
      Uint32 tableId = treeNodePtr.p->m_primaryTableId;
      err = computePartitionHash(signal, tmp, tableId, pruneKeyPtrI);
      releaseSection(pruneKeyPtrI);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }

      err = getNodes(signal, tmp, tableId);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }

      err = scanFrag_findFrag(list, fragPtr, tmp.fragId);
      if (unlikely(err != 0))
      {
        DEBUG_CRASH();
        break;
      }

      /**
       * NOTE: We can get different receiverRefs here
       *       for different keys. E.g. during node-recovery where
       *       the primary-fragment is switched.
       *
       *       Use the latest that we receive.
       *
       * TODO: Also double check table-reorg
       */
      fragPtr.p->m_ref = tmp.receiverRef;
    }
    else
    {
      jam();
      /**
       * If const prune, or no-prune, store on the first fragment,
       * and send to 1 or all resp.
       */
      list.first(fragPtr);
    }

    if (treeNodePtr.p->m_bits & TreeNode::T_KEYINFO_CONSTRUCTED)
    {
      jam();
      Local_pattern_store pattern(pool, treeNodePtr.p->m_keyPattern);

      /**
       * Test execution terminated due to 'OutOfSectionMemory':
       * - 17060: Fail on scanFrag_parent_row at first call
       * - 17061: Fail on scanFrag_parent_row if 'isLeaf'
       * - 17062: Fail on scanFrag_parent_row if treeNode not root
       * - 17063: Fail on scanFrag_parent_row at a random node of the query tree
       */
      if (ERROR_INSERTED(17060) ||
         (ERROR_INSERTED(17061) && (treeNodePtr.p->isLeaf())) ||
         (ERROR_INSERTED(17062) && (treeNodePtr.p->m_parentPtrI != RNIL)) ||
         (ERROR_INSERTED(17063) && (rand() % 7) == 0))
      {
        jam();
        CLEAR_ERROR_INSERT_VALUE;
        ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
                 __LINE__,  __FILE__);
        err = DbspjErr::OutOfSectionMemory;
        break;
      }

      bool hasNull = false;
      Uint32 keyPtrI = RNIL;
      err = expand(keyPtrI, pattern, rowRef, hasNull);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }
      if (hasNull)
      {
        jam();
        DEBUG("Key contains NULL values, ignoring it");
        DBUG_ASSERT((treeNodePtr.p->m_bits & TreeNode::T_ONE_SHOT) == 0);
        // Ignore this request as 'NULL == <column>' will never give a match
        releaseSection(keyPtrI);
        return;  // Bailout, SCANREQ would have returned 0 rows anyway
      }
      scanFrag_fixupBound(fragPtr, keyPtrI, rowRef.m_src_correlation);

      SectionReader key(keyPtrI, getSectionSegmentPool());
      err = appendReaderToSection(fragPtr.p->m_rangePtrI, key, key.getSize());
      releaseSection(keyPtrI);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }
    }
    else
    {
      jam();
      // Fixed key...fix later...
      ndbabort();
    }

    if (treeNodePtr.p->m_bits & TreeNode::T_ONE_SHOT)
    {
      jam();
      /**
       * Being T_ONE_SHOT means that we are only called
       *   with parent_row once, i.e. the batch is complete
       */
      scanFrag_parent_batch_complete(signal, requestPtr, treeNodePtr);
    }

    return;
  } while (0);

  ndbrequire(err);
  jam();
  abort(signal, requestPtr, err);
}


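/**
 * For illustration: for a pruned child scan, the per-parent-row flow
 * above is, in order:
 *
 *   expand()                - build the prune key from the parent row
 *   computePartitionHash()  - hash the distribution key part
 *   getNodes()              - ask DIH which fragment/node holds it
 *   scanFrag_findFrag()     - locate that fragment's ScanFragHandle
 *   appendReaderToSection() - append the range to fragPtr->m_rangePtrI
 *
 * so each parent row contributes one range, routed to exactly one
 * fragment; unpruned scans instead accumulate all ranges on the first
 * fragment handle.
 */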
void
Dbspj::scanFrag_fixupBound(Ptr<ScanFragHandle> fragPtr,
                           Uint32 ptrI, Uint32 corrVal)
{
  /**
   * Index bounds...need special care...
   *
   * 1) Set #bound no, bound-size, and renumber attributes
   */
  SectionReader r0(ptrI, getSectionSegmentPool());
  const Uint32 boundsz = r0.getSize();

  Uint32 tmp;
  ndbrequire(r0.peekWord(&tmp));
  ndbassert((corrVal & 0xFFFF) < MaxCorrelationId);
  tmp |= (boundsz << 16) | ((corrVal & 0xFFF) << 4);
  ndbrequire(r0.updateWord(tmp));
  ndbrequire(r0.step(1));    // Skip first BoundType

  // Note: The renumbering below assumes there are only EQ-bounds !!
  Uint32 id = 0;
  Uint32 len32;
  do
  {
    ndbrequire(r0.peekWord(&tmp));
    AttributeHeader ah(tmp);
    const Uint32 len = ah.getByteSize();
    AttributeHeader::init(&tmp, id++, len);
    ndbrequire(r0.updateWord(tmp));
    len32 = (len + 3) >> 2;
  } while (r0.step(2 + len32));  // Skip AttributeHeader(1) + Attribute(len32) + next BoundType(1)
}

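/**
 * For illustration: the first bound word is overlaid with the bound
 * size and the 12-bit parent-row correlation id:
 *
 *   tmp |= (boundsz << 16) | ((corrVal & 0xFFF) << 4);
 *
 * e.g. boundsz = 6 and corrVal = 0x123 OR in 0x00061230, leaving the
 * low 4 bits (the original BoundType) untouched. This 12-bit field is
 * why MaxCorrelationId caps the batch row count for non-leaf scans.
 */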
void
Dbspj::scanFrag_parent_batch_complete(Signal* signal,
                                      Ptr<Request> requestPtr,
                                      Ptr<TreeNode> treeNodePtr)
{
  jam();
  ndbassert(treeNodePtr.p->m_parentPtrI != RNIL);
  ndbassert(treeNodePtr.p->m_state == TreeNode::TN_INACTIVE);

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  ndbassert(data.m_frags_complete == data.m_fragCount);

  /**
   * Update the fragments' 'm_state' and the aggregated TreeNode::m_frag_*
   * counters to reflect which fragments we should now start scanning.
   * NOTE: 'm_state' is not maintained if all are 'complete' - the node
   * becomes inactive
   */
  {
    Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
    Ptr<ScanFragHandle> fragPtr;
    list.first(fragPtr);
    data.m_frags_complete = 0;

    if ((treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN) == 0)
    {
      /* No pruning, first fragment in list contains any range info */
      if (fragPtr.p->m_rangePtrI != RNIL)
      {
        /* All fragments to be scanned with range info */
        while(!fragPtr.isNull())
        {
          ndbassert(fragPtr.p->m_state == ScanFragHandle::SFH_NOT_STARTED ||
                    fragPtr.p->m_state == ScanFragHandle::SFH_COMPLETE);
          fragPtr.p->m_state = ScanFragHandle::SFH_NOT_STARTED;
          list.next(fragPtr);
        }
      }
      else
      {
        /* No range info, therefore an empty result set. */
        jam();
        data.m_frags_complete = data.m_fragCount;
      }
    }
    else
    {
      /* Per fragment pruning, mark and count pruned-out
       * (rangeless) fragments as completed
       */
      while(!fragPtr.isNull())
      {
        fragPtr.p->m_state = ScanFragHandle::SFH_NOT_STARTED;
        if (fragPtr.p->m_rangePtrI == RNIL)
        {
          jam();
          /**
           * This is a pruned scan, so we only scan those fragments that
           * some distribution key hashed to.
           */
          fragPtr.p->m_state = ScanFragHandle::SFH_COMPLETE;
          data.m_frags_complete++;
        }
        list.next(fragPtr);
      }
    }
    data.m_frags_not_started = data.m_fragCount - data.m_frags_complete;
  }

  if (data.m_frags_complete == data.m_fragCount)
  {
    jam();
    /**
     * No keys were produced...
     */
    return;
  }

  /**
   * When the parent's batch is complete, we send our batch
   */
  scanFrag_send(signal, requestPtr, treeNodePtr);
}


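/**
 * For illustration: the per-fragment states used in this module form a
 * small lifecycle:
 *
 *   SFH_NOT_STARTED --scanFrag_send()--> SFH_SCANNING --> SFH_COMPLETE
 *         \
 *          `-- (pruned scan, no range for this frag) --> SFH_COMPLETE
 *
 * scanFrag_parent_batch_complete() above re-arms the states for the
 * next batch; fragments that received no range go straight to
 * SFH_COMPLETE since a SCAN_FRAGREQ for them could return no rows.
 */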
void
Dbspj::scanFrag_send(Signal* signal,
                     Ptr<Request> requestPtr,
                     Ptr<TreeNode> treeNodePtr)
{
  jam();
  ndbassert(treeNodePtr.p->m_state == TreeNode::TN_INACTIVE);
  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  ndbassert(data.m_frags_outstanding == 0);
  ndbassert(data.m_frags_not_started == (data.m_fragCount - data.m_frags_complete));

  const ScanFragReq * org = (const ScanFragReq*)data.m_scanFragReq;
  ndbrequire(org->batch_size_rows > 0);

  data.m_firstBatch = true;
  if (treeNodePtr.p->m_bits & TreeNode::T_SCAN_PARALLEL)
  {
    jam();
    data.m_parallelism = MIN(data.m_frags_not_started, org->batch_size_rows);
  }
  else if (!data.m_parallelismStat.isValid())
  {
    /**
     * No valid statistics yet to estimate 'parallelism' from. We start
     * by reading a few fragments, but sufficiently many to take full
     * advantage of scan parallelism. Batch completion will provide a
     * parallelism sample, such that we can make a better parallelism
     * guess next time.
     * Note that SCAN_FRAGCONF may start more scans when this scan
     * completes, if a sufficient amount of unused batch size is left.
     */
    jam();
    data.m_parallelism = MIN(requestPtr.p->m_rootFragCnt,
                             data.m_frags_not_started);
  }
  else
  {
    jam();
    /**
     * Use statistics from earlier runs of this operation to estimate the
     * initial parallelism. We use the mean minus two times the standard
     * deviation to have a low risk of setting the parallelism too high
     * (as erring in the other direction is more costly).
     */
    Int32 parallelism =
      static_cast<Int32>(MIN(data.m_parallelismStat.getMean()
                             - 2 * data.m_parallelismStat.getStdDev()
                             + 0.5, // Add 0.5 to get proper rounding
                             org->batch_size_rows));

    if (parallelism < static_cast<Int32>(requestPtr.p->m_rootFragCnt))
    {
      jam();
      parallelism = MIN(requestPtr.p->m_rootFragCnt, data.m_frags_not_started);
    }
    else if (data.m_frags_not_started % parallelism != 0)
    {
      jam();
      /**
       * Set the parallelism such that we can expect to have similar
       * parallelism in each batch. For example if there are 8 remaining
       * fragments, then we should fetch 2 times 4 fragments rather than
       * 7+1.
       */
      const Int32 roundTrips = 1 + data.m_frags_not_started / parallelism;
      parallelism = data.m_frags_not_started / roundTrips;
    }

    // Allow higher parallelism to avoid 'rows' being capped by MAX_PARALLEL_OP_PER_SCAN
    if ((org->batch_size_rows / parallelism) > MAX_PARALLEL_OP_PER_SCAN)
    {
      jam();
      parallelism = MIN((org->batch_size_rows + MAX_PARALLEL_OP_PER_SCAN-1)
                        / MAX_PARALLEL_OP_PER_SCAN,
                        data.m_frags_not_started);
    }

    ndbassert(parallelism >= 1);
    ndbassert((Uint32)parallelism + data.m_frags_complete <= data.m_fragCount);
    data.m_parallelism = static_cast<Uint32>(parallelism);

#ifdef DEBUG_SCAN_FRAGREQ
    DEBUG("::scanFrag_send(), starting fragment scan with parallelism="
          << data.m_parallelism);
#endif
  }
  ndbrequire(data.m_parallelism > 0);

  // Cap batchSize-rows to avoid exceeding MAX_PARALLEL_OP_PER_SCAN
  const Uint32 bs_rows = MIN(org->batch_size_rows / data.m_parallelism,
                             MAX_PARALLEL_OP_PER_SCAN);
  const Uint32 bs_bytes = org->batch_size_bytes / data.m_parallelism;
  ndbassert(bs_rows > 0);
  ndbassert(bs_bytes > 0);

  data.m_rows_received = 0;
  data.m_rows_expecting = 0;
  data.m_largestBatchRows = 0;
  data.m_largestBatchBytes = 0;
  data.m_totalRows = 0;
  data.m_totalBytes = 0;

  Uint32 batchRange = 0;
  Uint32 frags_started =
    scanFrag_send(signal,
                   requestPtr,
                   treeNodePtr,
                   data.m_parallelism,
                   bs_bytes,
                   bs_rows,
                   batchRange);

  /**
   * scanFrag_send might fail to send (errors?):
   * Check that we really did send something before
   * updating outstanding & active.
   */
  if (likely(frags_started > 0))
  {
    jam();
    ndbrequire(static_cast<Uint32>(data.m_frags_outstanding +
                                   data.m_frags_complete) <=
               data.m_fragCount);

    data.m_batch_chunks = 1;
    requestPtr.p->m_cnt_active++;
    requestPtr.p->m_outstanding++;
    requestPtr.p->m_completed_tree_nodes.clear(treeNodePtr.p->m_node_no);
    treeNodePtr.p->m_state = TreeNode::TN_ACTIVE;
  }
}

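/**
 * For illustration: a worked example of the adaptive parallelism above,
 * assuming batch_size_rows = 992, MAX_PARALLEL_OP_PER_SCAN = 256, a
 * parallelism mean of 10.3 with stddev 2.1, and 8 fragments not yet
 * started:
 *
 *   parallelism = (Int32)(10.3 - 2*2.1 + 0.5) = 6
 *   8 % 6 != 0  ->  roundTrips = 1 + 8/6 = 2  ->  parallelism = 8/2 = 4
 *   bs_rows  = MIN(992/4, 256) = 248
 *   bs_bytes = batch_size_bytes / 4
 *
 * i.e. two round trips of 4 fragments each, with the batch budget
 * split evenly across the fragments scanned in parallel.
 */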
/**
 * Ask for the first batch for a number of fragments.
 *
 * Returns the number of fragments we requested the
 * 'first batch' from. (<= noOfFrags)
 */
Uint32
Dbspj::scanFrag_send(Signal* signal,
                     Ptr<Request> requestPtr,
                     Ptr<TreeNode> treeNodePtr,
                     Uint32 noOfFrags,
                     Uint32 bs_bytes,
                     Uint32 bs_rows,
                     Uint32& batchRange)
{
  jam();
  ndbassert(bs_bytes > 0);
  ndbassert(bs_rows > 0);
  ndbassert(bs_rows <= bs_bytes);
  /**
   * if (m_bits & prunemask):
   * - Range keys sliced out to each ScanFragHandle
   * - Else, range keys kept on first (and only) ScanFragHandle
   */
  const bool prune = treeNodePtr.p->m_bits &
    (TreeNode::T_PRUNE_PATTERN | TreeNode::T_CONST_PRUNE);

  /**
   * If the scan is repeatable, we must make sure not to release the range
   * keys, so that we can use them again in the next repetition.
   */
  const bool repeatable =
    (treeNodePtr.p->m_bits & TreeNode::T_SCAN_REPEATABLE) != 0;

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  ndbassert(noOfFrags > 0);
  ndbassert(noOfFrags <= data.m_frags_not_started);
  ScanFragReq* const req =
    reinterpret_cast<ScanFragReq*>(signal->getDataPtrSend());
  const ScanFragReq * const org
    = reinterpret_cast<ScanFragReq*>(data.m_scanFragReq);

  memcpy(req, org, sizeof(data.m_scanFragReq));
  // req->variableData[0] // set below
  req->variableData[1] = requestPtr.p->m_rootResultData;
  req->batch_size_bytes = bs_bytes;
  req->batch_size_rows = MIN(bs_rows,MAX_PARALLEL_OP_PER_SCAN);

  Uint32 requestsSent = 0;
  Uint32 err = checkTableError(treeNodePtr);
  if (likely(err == 0))
  {
    Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
    Ptr<ScanFragHandle> fragPtr;
    list.first(fragPtr);

    /**
     * Iterate over the list of fragments until we have sent as many
     * SCAN_FRAGREQs as we should.
     */
    while (requestsSent < noOfFrags)
    {
      jam();
      ndbassert(!fragPtr.isNull());
      /**
       * There is a 12-bit implementation limit on how large
       * the 'parent-row-correlation-id' may be. Thus, if rows
       * from this scan may be 'parents', the number of rows in a batch
       * should not exceed what can be represented in 12 bits.
       * See also Dbspj::scanFrag_fixupBound()
       */
      ndbassert(treeNodePtr.p->isLeaf() ||
                batchRange+bs_rows <= MaxCorrelationId);

      if (fragPtr.p->m_state != ScanFragHandle::SFH_NOT_STARTED)
      {
        // Skip forward to the frags that we should send.
        jam();
        list.next(fragPtr);
        continue;
      }

      const Uint32 ref = fragPtr.p->m_ref;

      if (noOfFrags==1 && !prune &&
          data.m_frags_not_started == data.m_fragCount &&
          refToNode(ref) != getOwnNodeId() &&
          list.hasNext(fragPtr))
      {
        /**
         * If we are doing a scan with adaptive parallelism and start with
         * parallelism=1 then it makes sense to fetch a batch from a fragment on
         * the local data node. The reason for this is that if that fragment
         * contains few rows, we may be able to read from several fragments in
         * parallel. Then we minimize the total number of round trips (to remote
         * data nodes) if we fetch the first fragment batch locally.
         */
        jam();
        list.next(fragPtr);
        continue;
      }

      /**
       * Set data specific for this fragment
       */
      req->senderData = fragPtr.i;
      req->fragmentNoKeyLen = fragPtr.p->m_fragId;
      req->variableData[0] = batchRange;

      /**
       * Set up the key-/attrInfo to be sent with the SCAN_FRAGREQ.
       * Determine whether these should be released as part of the
       * send or not. We try to 'release' whenever possible in order
       * to avoid copying them when sent locally. However, we need
       * to make sure that the key/attr will not be reused before
       * they can be released. Note:
       *
       * - Only the rootNode is ONE_SHOT.
       * - keyInfo comes from either m_send.m_keyInfoPtrI or
       *   fragPtr.p->m_rangePtrI (not both! - 'XOR').
       * - If the child scan is pruned, a separate 'rangePtr' is
       *   built for each frag - Non-pruned scans store the 'rangePtr'
       *   in the first frag, which is reused for all the frags.
       * - Child nodes can possibly be 'repeatable', which implies
       *   that m_rangePtrI can't be released yet.
       * - attrInfo is always taken from m_send.m_attrInfoPtrI, and
       *   is reused for all frag scans, either repeated or not!
       *
       * Note the somewhat different lifetime of key- vs attrInfo:
       * Except for the ONE_SHOT rootNode, the attrInfo always has
       * to be kept longer than the 'key' before being released.
       * As sendSignal() either releases both or none, we can't
       * set 'releaseAtSend' to suit both the key- and attrInfo
       * lifetime.
       *
       * Thus, we set 'releaseAtSend' to suit the shorter lifecycle
       * of the 'range' keys. attrInfo is duplicated whenever needed
       * such that a copy can be released together with the keyInfo.
       */
      Uint32 attrInfoPtrI = treeNodePtr.p->m_send.m_attrInfoPtrI;
      Uint32 keyInfoPtrI = treeNodePtr.p->m_send.m_keyInfoPtrI;
      bool releaseAtSend = false;

      if (treeNodePtr.p->m_bits & TreeNode::T_ONE_SHOT &&
          data.m_frags_not_started==1)
      {
        jam();
        ndbassert(!repeatable);
        ndbassert(fragPtr.p->m_rangePtrI == RNIL);
        /**
         * Pass sections to send and release them (root only)
         */
        treeNodePtr.p->m_send.m_attrInfoPtrI = RNIL;
        treeNodePtr.p->m_send.m_keyInfoPtrI = RNIL;
        releaseAtSend = true;
      }
      else
      {
        jam();
        Ptr<ScanFragHandle> fragWithRangePtr;
        if (prune)
        {
          jam();
          fragWithRangePtr = fragPtr;
          releaseAtSend = !repeatable;
        }
        else
        {
          /**
           * Note: if not 'prune', keyInfo is only set in the first fragPtr,
           *   even if it is valid for all of them. (save some mem.)
           */
          jam();
          list.first(fragWithRangePtr);
          releaseAtSend = (!repeatable && data.m_frags_not_started==1);
        }
        if (fragWithRangePtr.p->m_rangePtrI != RNIL)
        {
          ndbassert(keyInfoPtrI == RNIL);  //Not both keyInfo and 'range'
          keyInfoPtrI = fragWithRangePtr.p->m_rangePtrI;
        }
        /**
         * 'releaseAtSend' is set above based on the keyInfo lifetime.
         * Copy the attrInfo (comment above) whenever needed.
         */
        if (releaseAtSend)
        {
          jam();
          /**
           * Test execution terminated due to 'OutOfSectionMemory' which
           * may happen for different treeNodes in the request:
           * - 17090: Fail on any scanFrag_send()
           * - 17091: Fail after sending SCAN_FRAGREQ to some fragments
           * - 17092: Fail on scanFrag_send() if 'isLeaf'
           * - 17093: Fail on scanFrag_send() if treeNode not root
           */
          if (ERROR_INSERTED(17090) ||
             (ERROR_INSERTED(17091) && requestsSent > 1) ||
             (ERROR_INSERTED(17092) && treeNodePtr.p->isLeaf()) ||
             (ERROR_INSERTED(17093) && treeNodePtr.p->m_parentPtrI != RNIL))
          {
            jam();
            CLEAR_ERROR_INSERT_VALUE;
            ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
                     __LINE__,  __FILE__);
            err = DbspjErr::OutOfSectionMemory;
            break;
          }
          Uint32 tmp = RNIL;
          if (!dupSection(tmp, attrInfoPtrI))
          {
            jam();
            ndbassert(tmp == RNIL);  // Guard for memleak
            err = DbspjErr::OutOfSectionMemory;
            break;
          }
          attrInfoPtrI = tmp;

          /** Reflect the release of the keyInfo 'range' set above */
          fragWithRangePtr.p->m_rangePtrI = RNIL;
        } //if (releaseAtSend)
      }

      SectionHandle handle(this);
      getSection(handle.m_ptr[0], attrInfoPtrI);
      handle.m_cnt = 1;
      if (keyInfoPtrI != RNIL)
      {
        jam();
        getSection(handle.m_ptr[1], keyInfoPtrI);
        handle.m_cnt++;
      }

#if defined DEBUG_SCAN_FRAGREQ
      ndbout_c("SCAN_FRAGREQ to %x", ref);
      printSCAN_FRAGREQ(stdout, signal->getDataPtrSend(),
                        NDB_ARRAY_SIZE(treeNodePtr.p->m_scanFrag_data.m_scanFragReq),
                        DBLQH);
      printf("ATTRINFO: ");
      print(handle.m_ptr[0], stdout);
      if (handle.m_cnt > 1)
      {
        printf("KEYINFO: ");
        print(handle.m_ptr[1], stdout);
      }
#endif

      if (!ScanFragReq::getRangeScanFlag(req->requestInfo))
      {
        c_Counters.incr_counter(CI_LOCAL_TABLE_SCANS_SENT, 1);
      }
      else if (refToNode(ref) == getOwnNodeId())
      {
        c_Counters.incr_counter(CI_LOCAL_RANGE_SCANS_SENT, 1);
      }
      else
      {
        ndbrequire(!ERROR_INSERTED(17014));
        c_Counters.incr_counter(CI_REMOTE_RANGE_SCANS_SENT, 1);
      }

      /**
       * For a non-repeatable pruned scan, key info is unique for each
       * fragment and therefore cannot be reused, so we release key info
       * right away.
       */

      if (ERROR_INSERTED(17110) ||
         (ERROR_INSERTED(17111) && treeNodePtr.p->isLeaf()) ||
         (ERROR_INSERTED(17112) && treeNodePtr.p->m_parentPtrI != RNIL))
      {
        jam();
        CLEAR_ERROR_INSERT_VALUE;
        ndbout_c("Injecting invalid schema version error at line %d file %s",
                 __LINE__,  __FILE__);
        // Provoke 'Invalid schema version' in order to receive SCAN_FRAGREF
        req->schemaVersion++;
      }

      /**
       * To reduce the copy burden we want to keep hold of the
       * AttrInfo and KeyInfo sections after sending them to
       * LQH.  To do this we perform the fragmented send inline,
       * so that all fragments are sent *now*.  This avoids any
       * problems with the fragmented send CONTINUE 'thread' using
       * the section while we hold or even release it.  The
       * signal receiver can still take realtime breaks when
       * receiving.
       *
       * Indicate to sendBatchedFragmentedSignal that we want to
       * keep the fragments, so it must not free them, unless this
       * is the last request, in which case they can be freed. If
       * the last request is a local send then a copy is avoided.
       */
      {
        jam();
        sendBatchedFragmentedSignal(ref,
                                    GSN_SCAN_FRAGREQ,
                                    signal,
                                    NDB_ARRAY_SIZE(data.m_scanFragReq),
                                    JBB,
                                    &handle,
                                    !releaseAtSend); //Keep sent sections,
                                                     //unless last send
      }

      if (releaseAtSend)
      {
        ndbassert(handle.m_cnt == 0);
      }
      handle.clear();

      fragPtr.p->m_state = ScanFragHandle::SFH_SCANNING; // running
      data.m_frags_outstanding++;
7964       data.m_frags_not_started--;
7965       batchRange += bs_rows;
7966       requestsSent++;
7967       list.next(fragPtr);
7968     } // while (requestsSent < noOfFrags)
7969   }
7970   if (err)
7971   {
7972     jam();
7973     abort(signal, requestPtr, err);
7974   }
7975 
7976   return requestsSent;
7977 }
7978 
7979 void
7980 Dbspj::scanFrag_parent_batch_repeat(Signal* signal,
7981                                      Ptr<Request> requestPtr,
7982                                      Ptr<TreeNode> treeNodePtr)
7983 {
7984   jam();
7985   ndbassert(treeNodePtr.p->m_parentPtrI != RNIL);
7986 
7987   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
7988 
7989   DEBUG("scanFrag_parent_batch_repeat(), m_node_no: " << treeNodePtr.p->m_node_no
7990         << ", m_batch_chunks: " << data.m_batch_chunks);
7991 
7992   ndbassert(treeNodePtr.p->m_bits & TreeNode::T_SCAN_REPEATABLE);
7993 
7994   /**
7995    * Register fragment-scans to be restarted if we didn't get all
7996    * previously fetched parent-related child rows in a single batch.
7997    */
7998   if (data.m_batch_chunks > 1)
7999   {
8000     jam();
8001     DEBUG("Register TreeNode for restart, m_node_no: " << treeNodePtr.p->m_node_no);
8002     ndbrequire(treeNodePtr.p->m_state != TreeNode::TN_ACTIVE);
8003     registerActiveCursor(requestPtr, treeNodePtr);
8004     data.m_batch_chunks = 0;
8005   }
8006 }
8007 
8008 void
8009 Dbspj::scanFrag_countSignal(const Signal* signal,
8010                        Ptr<Request> requestPtr,
8011                        Ptr<TreeNode> treeNodePtr,
8012                        Uint32 cnt)
8013 {
8014   jam();
8015   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8016   data.m_rows_received += cnt;
8017 
8018   if (data.m_frags_outstanding == 0 &&
8019       data.m_rows_received == data.m_rows_expecting)
8020   {
8021     jam();
8022     ndbassert(requestPtr.p->m_outstanding > 0);
8023     requestPtr.p->m_outstanding--;
8024 
8025     // We have received all rows for this treeNode in this batch.
8026     requestPtr.p->m_completed_tree_nodes.set(treeNodePtr.p->m_node_no);
8027   }
8028 }
8029 
8030 void
8031 Dbspj::scanFrag_execSCAN_FRAGCONF(Signal* signal,
8032                                   Ptr<Request> requestPtr,
8033                                   Ptr<TreeNode> treeNodePtr,
8034                                   Ptr<ScanFragHandle> fragPtr)
8035 {
8036   jam();
8037 
8038   const ScanFragConf * conf = (const ScanFragConf*)(signal->getDataPtr());
8039 
8040   Uint32 rows = conf->completedOps;
8041   Uint32 done = conf->fragmentCompleted;
8042   Uint32 bytes = conf->total_len * sizeof(Uint32);
8043 
8044   Uint32 state = fragPtr.p->m_state;
8045   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8046 
8047   if (state == ScanFragHandle::SFH_WAIT_CLOSE && done == 0)
8048   {
8049     jam();
8050     /**
8051      * We sent an explicit close request...ignore this...a close will come later
8052      */
8053     return;
8054   }
8055 
8056   requestPtr.p->m_rows += rows;
8057   data.m_totalRows += rows;
8058   data.m_totalBytes += bytes;
8059   data.m_largestBatchRows = MAX(data.m_largestBatchRows, rows);
8060   data.m_largestBatchBytes = MAX(data.m_largestBatchBytes, bytes);
8061 
8062   if (treeNodePtr.p->m_bits & TreeNode::T_EXPECT_TRANSID_AI)
8063   {
8064     jam();
8065     data.m_rows_expecting += rows;
8066   }
8067 
8068   ndbrequire(data.m_frags_outstanding);
8069   ndbrequire(state == ScanFragHandle::SFH_SCANNING ||
8070              state == ScanFragHandle::SFH_WAIT_CLOSE);
8071 
8072   data.m_frags_outstanding--;
8073   fragPtr.p->m_state = ScanFragHandle::SFH_WAIT_NEXTREQ;
8074 
8075   if (done)
8076   {
8077     jam();
8078     fragPtr.p->m_state = ScanFragHandle::SFH_COMPLETE;
8079     ndbrequire(data.m_frags_complete < data.m_fragCount);
8080     data.m_frags_complete++;
8081 
8082     if (data.m_frags_complete == data.m_fragCount ||
8083         ((requestPtr.p->m_state & Request::RS_ABORTING) != 0 &&
8084          data.m_fragCount == (data.m_frags_complete + data.m_frags_not_started)))
8085     {
8086       jam();
8087       ndbrequire(requestPtr.p->m_cnt_active);
8088       requestPtr.p->m_cnt_active--;
8089       treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
8090     }
8091   }
8092 
8093   if (data.m_frags_outstanding == 0)
8094   {
8095     const bool isFirstBatch = data.m_firstBatch;
8096     data.m_firstBatch = false;
8097 
8098     const ScanFragReq * const org
8099       = reinterpret_cast<const ScanFragReq*>(data.m_scanFragReq);
8100 
8101     if (data.m_frags_complete == data.m_fragCount)
8102     {
8103       jam();
8104       /**
8105        * Calculate what would have been the optimal parallelism for the
8106        * scan instance that we have just completed, and update
8107        * 'parallelismStat' with this value. We then use this statistic to set
8108        * the initial parallelism for the next instance of this operation.
8109        */
8110       double parallelism = data.m_fragCount;
8111       if (data.m_totalRows > 0)
8112       {
8113         parallelism = MIN(parallelism,
8114                           double(org->batch_size_rows) * data.m_fragCount
8115                           / data.m_totalRows);
8116       }
8117       if (data.m_totalBytes > 0)
8118       {
8119         parallelism = MIN(parallelism,
8120                           double(org->batch_size_bytes) * data.m_fragCount
8121                           / data.m_totalBytes);
8122       }
8123       data.m_parallelismStat.sample(parallelism);
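      /**
       * Worked example (hypothetical numbers): with fragCount = 8,
       * batch_size_rows = 256, batch_size_bytes = 32768 and a completed
       * scan that returned totalRows = 4096 and totalBytes = 262144:
       *   row-limited  parallelism: 256 * 8 / 4096     = 0.5
       *   byte-limited parallelism: 32768 * 8 / 262144 = 1.0
       * so the sampled value is MIN(8, 0.5, 1.0) = 0.5: on average even a
       * single fragment overflows one batch, and the statistics will steer
       * the next execution towards scanning one fragment at a time.
       */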
8124     }
8125 
8126     /**
8127      * Don't continue scan if we're aborting...
8128      */
8129     ndbassert(state != ScanFragHandle::SFH_WAIT_CLOSE ||
8130               (requestPtr.p->m_state & Request::RS_ABORTING));
8131 
8132     if (state == ScanFragHandle::SFH_SCANNING &&
8133         isFirstBatch && data.m_frags_not_started > 0)
8134     {
8135       jam();
8136       /**
8137        * Check if we can expect to be able to fetch the entire result set by
8138        * asking for more fragments within the same batch. This may improve
8139        * performance for bushy scans, as subsequent bushy branches must be
8140        * re-executed for each batch of this scan.
8141        */
8142 
8143       /**
8144        * Find the maximal correlation value that we may have seen so far.
8145        * Correlation value must be unique within batch and smaller than
8146        * org->batch_size_rows.
8147        */
8148       const Uint32 maxCorrVal = (data.m_totalRows == 0) ? 0 :
8149         ((org->batch_size_rows / data.m_parallelism) * (data.m_parallelism - 1))
8150         + data.m_totalRows;
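      /**
       * Worked example (hypothetical numbers): with batch_size_rows = 256
       * and m_parallelism = 4, each fragment was assigned a correlation-id
       * range of 256/4 = 64 values, the highest range starting at 64*3 = 192.
       * If m_totalRows = 20 rows arrived in total, no correlation id above
       * 192 + 20 = 212 can have been used, so maxCorrVal = 212 is a safe
       * upper bound.
       */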
8151 
8152       // Number of rows & bytes that we can still fetch in this batch.
8153       const Int32 remainingRows
8154         = static_cast<Int32>(org->batch_size_rows - maxCorrVal);
8155       const Int32 remainingBytes
8156         = static_cast<Int32>(org->batch_size_bytes - data.m_totalBytes);
8157 
8158       if (remainingRows >= data.m_frags_not_started &&
8159           remainingBytes >= data.m_frags_not_started &&
8160           /**
8161            * Check that (remaining row capacity)/(remaining fragments) is
8162            * greater or equal to (rows read so far)/(finished fragments).
8163            */
8164           remainingRows * static_cast<Int32>(data.m_parallelism) >=
8165             static_cast<Int32>(data.m_totalRows * data.m_frags_not_started) &&
8166           remainingBytes * static_cast<Int32>(data.m_parallelism) >=
8167             static_cast<Int32>(data.m_totalBytes * data.m_frags_not_started))
8168       {
8169         jam();
8170         Uint32 batchRange = maxCorrVal;
8171         Uint32 bs_rows  = remainingRows / data.m_frags_not_started;
8172         Uint32 bs_bytes = remainingBytes / data.m_frags_not_started;
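        /**
         * Continuing the worked example above: remainingRows = 256 - 212
         * = 44, and with totalBytes = 8000, remainingBytes = 32768 - 8000
         * = 24768. With m_frags_not_started = 4, each of the remaining
         * fragments is asked for bs_rows = 44/4 = 11 rows and
         * bs_bytes = 24768/4 = 6192 bytes.
         */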
8173 
8174         DEBUG("::scanFrag_execSCAN_FRAGCONF() first batch was not full."
8175               " Asking for new batches from " << data.m_frags_not_started <<
8176               " fragments with " <<
8177               bs_rows  <<" rows and " <<
8178               bs_bytes << " bytes.");
8179 
8180         if (unlikely(bs_rows > bs_bytes))
8181           bs_rows = bs_bytes;
8182 
8183         Uint32 frags_started =
8184           scanFrag_send(signal,
8185                          requestPtr,
8186                          treeNodePtr,
8187                          data.m_frags_not_started,
8188                          bs_bytes,
8189                          bs_rows,
8190                          batchRange);
8191 
8192         if (likely(frags_started > 0))
8193           return;
8194 
8195         // Else: scanFrag_send() didn't send anything for some reason.
8196         // Need to continue into 'completion detection' below.
8197         jam();
8198       }
8199     } // if (isFirstBatch ...)
8200 
8201     if (data.m_rows_received == data.m_rows_expecting ||
8202         state == ScanFragHandle::SFH_WAIT_CLOSE)
8203     {
8204       jam();
8205       ndbassert(requestPtr.p->m_outstanding > 0);
8206       requestPtr.p->m_outstanding--;
8207       requestPtr.p->m_completed_tree_nodes.set(treeNodePtr.p->m_node_no);
8208       handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
8209     }
8210   } // if (data.m_frags_outstanding == 0)
8211 }
8212 
8213 void
8214 Dbspj::scanFrag_execSCAN_FRAGREF(Signal* signal,
8215                                  Ptr<Request> requestPtr,
8216                                  Ptr<TreeNode> treeNodePtr,
8217                                  Ptr<ScanFragHandle> fragPtr)
8218 {
8219   jam();
8220 
8221   const ScanFragRef * rep = CAST_CONSTPTR(ScanFragRef, signal->getDataPtr());
8222   const Uint32 errCode = rep->errorCode;
8223 
8224   Uint32 state = fragPtr.p->m_state;
8225   ndbrequire(state == ScanFragHandle::SFH_SCANNING ||
8226              state == ScanFragHandle::SFH_WAIT_CLOSE);
8227 
8228   fragPtr.p->m_state = ScanFragHandle::SFH_COMPLETE;
8229 
8230   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8231   ndbrequire(data.m_frags_complete < data.m_fragCount);
8232   data.m_frags_complete++;
8233   ndbrequire(data.m_frags_outstanding > 0);
8234   data.m_frags_outstanding--;
8235 
8236   if (data.m_fragCount == (data.m_frags_complete + data.m_frags_not_started))
8237   {
8238     jam();
8239     ndbrequire(requestPtr.p->m_cnt_active);
8240     requestPtr.p->m_cnt_active--;
8241     treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
8242   }
8243 
8244   if (data.m_frags_outstanding == 0)
8245   {
8246     jam();
8247     ndbrequire(requestPtr.p->m_outstanding);
8248     requestPtr.p->m_outstanding--;
8249   }
8250 
8251   abort(signal, requestPtr, errCode);
8252 }
8253 
8254 void
8255 Dbspj::scanFrag_execSCAN_NEXTREQ(Signal* signal,
8256                                  Ptr<Request> requestPtr,
8257                                  Ptr<TreeNode> treeNodePtr)
8258 {
8259   jam();
8260   Uint32 err = checkTableError(treeNodePtr);
8261   if (unlikely(err))
8262   {
8263     jam();
8264     abort(signal, requestPtr, err);
8265     return;
8266   }
8267 
8268   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8269   const ScanFragReq * org = (const ScanFragReq*)data.m_scanFragReq;
8270 
8271   data.m_rows_received = 0;
8272   data.m_rows_expecting = 0;
8273   ndbassert(data.m_frags_outstanding == 0);
8274 
8275   ndbrequire(data.m_frags_complete < data.m_fragCount);
8276   if ((treeNodePtr.p->m_bits & TreeNode::T_SCAN_PARALLEL) == 0)
8277   {
8278     jam();
8279     /**
8280      * Since fetching a few large batches is more efficient than many
8281      * small ones, we set parallelism to the lowest value at which we
8282      * can still expect each batch to be full.
8283      */
8284     if (data.m_largestBatchRows < org->batch_size_rows/data.m_parallelism &&
8285         data.m_largestBatchBytes < org->batch_size_bytes/data.m_parallelism)
8286     {
8287       jam();
8288       data.m_parallelism = MIN(data.m_fragCount - data.m_frags_complete,
8289                                org->batch_size_rows);
8290       if (data.m_largestBatchRows > 0)
8291       {
8292         jam();
8293         data.m_parallelism =
8294           MIN(org->batch_size_rows / data.m_largestBatchRows,
8295               data.m_parallelism);
8296       }
8297       if (data.m_largestBatchBytes > 0)
8298       {
8299         jam();
8300         data.m_parallelism =
8301           MIN(data.m_parallelism,
8302               org->batch_size_bytes/data.m_largestBatchBytes);
8303       }
8304       if (data.m_frags_complete == 0 &&
8305           data.m_frags_not_started % data.m_parallelism != 0)
8306       {
8307         jam();
8308         /**
8309          * Set parallelism such that we can expect to have similar
8310          * parallelism in each batch. For example if there are 8 remaining
8311          * fragments, then we should fetch 2 times 4 fragments rather than
8312          * 7+1.
8313          */
8314         const Uint32 roundTrips =
8315           1 + data.m_frags_not_started / data.m_parallelism;
8316         data.m_parallelism = data.m_frags_not_started / roundTrips;
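        /**
         * Worked example: with m_frags_not_started = 8 and a computed
         * m_parallelism of 7, 8 % 7 != 0 gives roundTrips = 1 + 8/7 = 2
         * (integer division), and m_parallelism becomes 8/2 = 4: two even
         * waves of 4 fragments rather than 7 + 1.
         */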
8317       }
8318     }
8319     else
8320     {
8321       jam();
8322       // We get full batches, so we should lower parallelism.
8323       data.m_parallelism = MIN(data.m_fragCount - data.m_frags_complete,
8324                                MAX(1, data.m_parallelism/2));
8325     }
8326     if (data.m_parallelism < requestPtr.p->m_rootFragCnt)
8327     {
8328       // Avoid starting so few scans that some LDM-threads are sitting idle
8329       data.m_parallelism = MIN(data.m_fragCount - data.m_frags_complete,
8330       requestPtr.p->m_rootFragCnt);
8331     }
8332     ndbassert(data.m_parallelism > 0);
8333 #ifdef DEBUG_SCAN_FRAGREQ
8334     DEBUG("::scanFrag_execSCAN_NEXTREQ() Asking for new batches from " <<
8335           data.m_parallelism <<
8336           " fragments with " << org->batch_size_rows/data.m_parallelism <<
8337           " rows and " << org->batch_size_bytes/data.m_parallelism <<
8338           " bytes.");
8339 #endif
8340   }
8341   else // Max parallelism
8342   {
8343     jam();
8344     data.m_parallelism = MIN(data.m_fragCount - data.m_frags_complete,
8345                              org->batch_size_rows);
8346   }
8347 
8348   const Uint32 bs_rows = MIN(org->batch_size_rows/data.m_parallelism,
8349                              MAX_PARALLEL_OP_PER_SCAN);
8350   ndbassert(bs_rows > 0);
8351   ScanFragNextReq* req =
8352     reinterpret_cast<ScanFragNextReq*>(signal->getDataPtrSend());
8353   req->requestInfo = 0;
8354   ScanFragNextReq::setCorrFactorFlag(req->requestInfo);
8355   req->transId1 = requestPtr.p->m_transId[0];
8356   req->transId2 = requestPtr.p->m_transId[1];
8357   req->batch_size_rows = bs_rows;
8358   req->batch_size_bytes = org->batch_size_bytes/data.m_parallelism;
8359 
8360   Uint32 batchRange = 0;
8361   Ptr<ScanFragHandle> fragPtr;
8362   Uint32 sentFragCount = 0;
8363   {
8364     /**
8365      * First, ask for more data from fragments that are already started.
8366      */
8367     Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8368     list.first(fragPtr);
8369     while (sentFragCount < data.m_parallelism && !fragPtr.isNull())
8370     {
8371       jam();
8372       ndbassert(fragPtr.p->m_state == ScanFragHandle::SFH_WAIT_NEXTREQ ||
8373                 fragPtr.p->m_state == ScanFragHandle::SFH_COMPLETE ||
8374                 fragPtr.p->m_state == ScanFragHandle::SFH_NOT_STARTED);
8375       if (fragPtr.p->m_state == ScanFragHandle::SFH_WAIT_NEXTREQ)
8376       {
8377         jam();
8378 
8379         data.m_frags_outstanding++;
8380         req->variableData[0] = batchRange;
8381         fragPtr.p->m_state = ScanFragHandle::SFH_SCANNING;
8382         batchRange += bs_rows;
8383 
8384         DEBUG("scanFrag_execSCAN_NEXTREQ to: " << hex
8385               << fragPtr.p->m_ref
8386               << ", m_node_no=" << treeNodePtr.p->m_node_no
8387               << ", senderData: " << req->senderData);
8388 
8389 #ifdef DEBUG_SCAN_FRAGREQ
8390         printSCANFRAGNEXTREQ(stdout, &signal->theData[0],
8391                              ScanFragNextReq::SignalLength + 1, DBLQH);
8392 #endif
8393 
8394         req->senderData = fragPtr.i;
8395         sendSignal(fragPtr.p->m_ref, GSN_SCAN_NEXTREQ, signal,
8396                    ScanFragNextReq::SignalLength + 1,
8397                    JBB);
8398         sentFragCount++;
8399       }
8400       list.next(fragPtr);
8401     }
8402   }
8403 
8404   Uint32 frags_started = 0;
8405   if (sentFragCount < data.m_parallelism)
8406   {
8407     /**
8408      * Then start new fragments until we reach data.m_parallelism.
8409      */
8410     jam();
8411     ndbassert(data.m_frags_not_started != 0);
8412     frags_started =
8413       scanFrag_send(signal,
8414                      requestPtr,
8415                      treeNodePtr,
8416                      data.m_parallelism - sentFragCount,
8417                      org->batch_size_bytes/data.m_parallelism,
8418                      bs_rows,
8419                      batchRange);
8420   }
8421   /**
8422    * sendSignal() or scanFrag_send() might have failed to send:
8423    * Check that we really did send something before
8424    * updating outstanding & active.
8425    */
8426   if (likely(sentFragCount+frags_started > 0))
8427   {
8428     jam();
8429     ndbrequire(data.m_batch_chunks > 0);
8430     data.m_batch_chunks++;
8431 
8432     requestPtr.p->m_outstanding++;
8433     requestPtr.p->m_completed_tree_nodes.clear(treeNodePtr.p->m_node_no);
8434     ndbassert(treeNodePtr.p->m_state == TreeNode::TN_ACTIVE);
8435   }
8436 }
8437 
8438 void
8439 Dbspj::scanFrag_complete(Signal* signal,
8440                          Ptr<Request> requestPtr,
8441                          Ptr<TreeNode> treeNodePtr)
8442 {
8443   jam();
8444   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8445   if (!data.m_fragments.isEmpty())
8446   {
8447     jam();
8448     DihScanTabCompleteRep* rep=(DihScanTabCompleteRep*)signal->getDataPtrSend();
8449     rep->tableId = treeNodePtr.p->m_tableOrIndexId;
8450     rep->scanCookie = data.m_scanCookie;
8451     rep->jamBufferPtr = jamBuffer();
8452 
8453     EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_COMPLETE_REP,
8454                       signal, DihScanTabCompleteRep::SignalLength, 0);
8455   }
8456 }
8457 
8458 void
8459 Dbspj::scanFrag_abort(Signal* signal,
8460                       Ptr<Request> requestPtr,
8461                       Ptr<TreeNode> treeNodePtr)
8462 {
8463   jam();
8464 
8465   switch(treeNodePtr.p->m_state){
8466   case TreeNode::TN_BUILDING:
8467   case TreeNode::TN_PREPARING:
8468   case TreeNode::TN_INACTIVE:
8469   case TreeNode::TN_COMPLETING:
8470   case TreeNode::TN_END:
8471     DEBUG("scanFrag_abort"
8472 	  << ", transId: " << hex << requestPtr.p->m_transId[0]
8473 	  << ","           << hex << requestPtr.p->m_transId[1]
8474 	  << ", state: " << treeNodePtr.p->m_state);
8475     return;
8476 
8477   case TreeNode::TN_ACTIVE:
8478     jam();
8479     break;
8480   }
8481 
8482   ScanFragNextReq* req = CAST_PTR(ScanFragNextReq, signal->getDataPtrSend());
8483   req->requestInfo = 0;
8484   ScanFragNextReq::setCloseFlag(req->requestInfo, 1);
8485   req->transId1 = requestPtr.p->m_transId[0];
8486   req->transId2 = requestPtr.p->m_transId[1];
8487   req->batch_size_rows = 0;
8488   req->batch_size_bytes = 0;
8489 
8490   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8491   Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8492   Ptr<ScanFragHandle> fragPtr;
8493 
8494   Uint32 cnt_waiting = 0;
8495   Uint32 cnt_scanning = 0;
8496   for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
8497   {
8498     switch(fragPtr.p->m_state){
8499     case ScanFragHandle::SFH_NOT_STARTED:
8500     case ScanFragHandle::SFH_COMPLETE:
8501     case ScanFragHandle::SFH_WAIT_CLOSE:
8502       jam();
8503       break;
8504     case ScanFragHandle::SFH_WAIT_NEXTREQ:
8505       jam();
8506       cnt_waiting++;              // was idle...
8507       data.m_frags_outstanding++; // is closing
8508       goto do_abort;
8509     case ScanFragHandle::SFH_SCANNING:
8510       jam();
8511       cnt_scanning++;
8512       goto do_abort;
8513     do_abort:
8514       req->senderData = fragPtr.i;
8515       sendSignal(fragPtr.p->m_ref, GSN_SCAN_NEXTREQ, signal,
8516                  ScanFragNextReq::SignalLength, JBB);
8517 
8518       fragPtr.p->m_state = ScanFragHandle::SFH_WAIT_CLOSE;
8519       break;
8520     }
8521   }
8522 
8523   if (cnt_scanning == 0)
8524   {
8525     if (cnt_waiting > 0)
8526     {
8527       /**
8528        * If all were waiting...this should increase m_outstanding
8529        */
8530       jam();
8531       requestPtr.p->m_outstanding++;
8532     }
8533     else
8534     {
8535       /**
8536        * All fragments are either complete or not yet started, so there is
8537        * nothing to abort.
8538        */
8539       jam();
8540       ndbassert(data.m_frags_not_started > 0);
8541       ndbrequire(requestPtr.p->m_cnt_active);
8542       requestPtr.p->m_cnt_active--;
8543       treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
8544     }
8545   }
8546 }
8547 
8548 Uint32
8549 Dbspj::scanFrag_execNODE_FAILREP(Signal* signal,
8550                                  Ptr<Request> requestPtr,
8551                                  Ptr<TreeNode> treeNodePtr,
8552                                  const NdbNodeBitmask nodes)
8553 {
8554   jam();
8555 
8556   switch(treeNodePtr.p->m_state){
8557   case TreeNode::TN_PREPARING:
8558   case TreeNode::TN_INACTIVE:
8559     return 1;
8560 
8561   case TreeNode::TN_BUILDING:
8562   case TreeNode::TN_COMPLETING:
8563   case TreeNode::TN_END:
8564     return 0;
8565 
8566   case TreeNode::TN_ACTIVE:
8567     jam();
8568     break;
8569   }
8570 
8571 
8572   Uint32 sum = 0;
8573   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8574   Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8575   Ptr<ScanFragHandle> fragPtr;
8576 
8577   Uint32 save0 = data.m_frags_outstanding;
8578   Uint32 save1 = data.m_frags_complete;
8579 
8580   for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
8581   {
8582     if (nodes.get(refToNode(fragPtr.p->m_ref)) == false)
8583     {
8584       jam();
8585       /**
8586        * No action needed
8587        */
8588       continue;
8589     }
8590 
8591     switch(fragPtr.p->m_state){
8592     case ScanFragHandle::SFH_NOT_STARTED:
8593       jam();
8594       ndbrequire(data.m_frags_complete < data.m_fragCount);
8595       data.m_frags_complete++;
8596       ndbrequire(data.m_frags_not_started > 0);
8597       data.m_frags_not_started--;
8598       // fall through
8599     case ScanFragHandle::SFH_COMPLETE:
8600       jam();
8601       sum++; // indicate that we should abort
8602       /**
8603        * we could keep a list of all fragments...
8604        *   or execute DIGETNODES again...
8605        *   but for now, we don't
8606        */
8607       break;
8608     case ScanFragHandle::SFH_WAIT_CLOSE:
8609     case ScanFragHandle::SFH_SCANNING:
8610       jam();
8611       ndbrequire(data.m_frags_outstanding > 0);
8612       data.m_frags_outstanding--;
8613       // fall through
8614     case ScanFragHandle::SFH_WAIT_NEXTREQ:
8615       jam();
8616       sum++;
8617       ndbrequire(data.m_frags_complete < data.m_fragCount);
8618       data.m_frags_complete++;
8619       break;
8620     }
8621     fragPtr.p->m_ref = 0;
8622     fragPtr.p->m_state = ScanFragHandle::SFH_COMPLETE;
8623   }
8624 
8625   if (save0 != 0 && data.m_frags_outstanding == 0)
8626   {
8627     jam();
8628     ndbrequire(requestPtr.p->m_outstanding);
8629     requestPtr.p->m_outstanding--;
8630   }
8631 
8632   if (save1 != 0 &&
8633       data.m_fragCount == (data.m_frags_complete + data.m_frags_not_started))
8634   {
8635     jam();
8636     ndbrequire(requestPtr.p->m_cnt_active);
8637     requestPtr.p->m_cnt_active--;
8638     treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
8639   }
8640 
8641   return sum;
8642 }
8643 
8644 void
8645 Dbspj::scanFrag_release_rangekeys(Ptr<Request> requestPtr,
8646                                   Ptr<TreeNode> treeNodePtr)
8647 {
8648   jam();
8649   DEBUG("scanFrag_release_rangekeys(), tree node " << treeNodePtr.i
8650           << " m_node_no: " << treeNodePtr.p->m_node_no);
8651 
8652   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8653   Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8654   Ptr<ScanFragHandle> fragPtr;
8655 
8656   if (treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN)
8657   {
8658     jam();
8659     for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
8660     {
8661       if (fragPtr.p->m_rangePtrI != RNIL)
8662       {
8663         releaseSection(fragPtr.p->m_rangePtrI);
8664         fragPtr.p->m_rangePtrI = RNIL;
8665       }
8666     }
8667   }
8668   else
8669   {
8670     jam();
8671     if (!list.first(fragPtr))
8672       return;
8673     if (fragPtr.p->m_rangePtrI != RNIL)
8674     {
8675       releaseSection(fragPtr.p->m_rangePtrI);
8676       fragPtr.p->m_rangePtrI = RNIL;
8677     }
8678   }
8679 }
8680 
8681 /**
8682  * Parent batch has completed, and will not refetch (X-joined) results
8683  * from its children. Release & reset range keys which are unsent, or
8684  * which we have kept for possible resubmits.
8685  */
8686 void
8687 Dbspj::scanFrag_parent_batch_cleanup(Ptr<Request> requestPtr,
8688                                      Ptr<TreeNode> treeNodePtr)
8689 {
8690   DEBUG("scanFrag_parent_batch_cleanup");
8691   scanFrag_release_rangekeys(requestPtr,treeNodePtr);
8692 }
8693 
8694 /**
8695  * Do final cleanup of the specified TreeNode. There will be no
8696  * more (re-)execution of this TreeNode or any other,
8697  * so there is no need to re-init for further execution.
8698  */
8699 void
8700 Dbspj::scanFrag_cleanup(Ptr<Request> requestPtr,
8701                         Ptr<TreeNode> treeNodePtr)
8702 {
8703   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8704   DEBUG("scanFrag_cleanup");
8705 
8706   /**
8707    * Range keys have been collected wherever there are uncompleted
8708    * parent batches... release them to avoid a memory leak.
8709    */
8710   scanFrag_release_rangekeys(requestPtr,treeNodePtr);
8711 
8712   if (treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN)
8713   {
8714     jam();
8715     LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
8716     Local_pattern_store pattern(pool, data.m_prunePattern);
8717     pattern.release();
8718   }
8719   else if (treeNodePtr.p->m_bits & TreeNode::T_CONST_PRUNE)
8720   {
8721     jam();
8722     if (data.m_constPrunePtrI != RNIL)
8723     {
8724       jam();
8725       releaseSection(data.m_constPrunePtrI);
8726       data.m_constPrunePtrI = RNIL;
8727     }
8728   }
8729 
8730   cleanup_common(requestPtr, treeNodePtr);
8731 }
8732 
8733 
8734 bool
8735 Dbspj::scanFrag_checkNode(const Ptr<Request> requestPtr,
8736                           const Ptr<TreeNode> treeNodePtr)
8737 {
8738   jam();
8739   if (treeNodePtr.p->m_state != TreeNode::TN_ACTIVE)
8740   {
8741     return true;
8742   }
8743 
8744   bool checkResult = true;
8745 
8746   {
8747     ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8748     Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8749     Ptr<ScanFragHandle> fragPtr;
8750 
8751     Uint32 frags_not_started = 0;
8752     Uint32 frags_outstanding_scan = 0;
8753     Uint32 frags_outstanding_close = 0;
8754     Uint32 frags_waiting = 0;
8755     Uint32 frags_completed = 0;
8756 
8757     Uint32 fragCount = 0;
8758 
8759     for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
8760     {
8761       fragCount++;
8762       switch(fragPtr.p->m_state){
8763       case ScanFragHandle::SFH_NOT_STARTED:
8764         jam();
8765         frags_not_started++;
8766         break;
8767       case ScanFragHandle::SFH_SCANNING:
8768         jam();
8769         frags_outstanding_scan++;
8770         break;
8771       case ScanFragHandle::SFH_WAIT_CLOSE:
8772         jam();
8773         frags_outstanding_close++;
8774         break;
8775       case ScanFragHandle::SFH_WAIT_NEXTREQ:
8776         jam();
8777         frags_waiting++;
8778         break;
8779       case ScanFragHandle::SFH_COMPLETE:
8780         jam();
8781         frags_completed++;
8782         break;
8783       default:
8784         checkResult &= spjCheck(false);
8785         break;
8786       }
8787     }
8788 
8789     /**
8790      * Compare counters to state; the state must be valid
8791      * at all stable points in time for execNODE_FAILREP
8792      * handling.
8793      */
8794     checkResult &= spjCheck(data.m_frags_not_started == frags_not_started);
8795     checkResult &= spjCheck(data.m_frags_outstanding ==
8796                             (frags_outstanding_scan +
8797                              frags_outstanding_close));
8798     checkResult &= spjCheck(data.m_frags_complete == frags_completed);
8799   }
8800 
8801   return checkResult;
8802 }
8803 
8804 
8805 void
8806 Dbspj::scanFrag_dumpNode(const Ptr<Request> requestPtr,
8807                          const Ptr<TreeNode> treeNodePtr)
8808 {
8809   jam();
8810 
8811   /* Non const ref due to list iteration below */
8812   ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8813 
8814   g_eventLogger->info("DBSPJ %u :       ScanFrag fragCount %u frags_complete %u "
8815                       "frags_outstanding %u frags_not_started %u ",
8816                       instance(),
8817                       data.m_fragCount,
8818                       data.m_frags_complete,
8819                       data.m_frags_outstanding,
8820                       data.m_frags_not_started);
8821   g_eventLogger->info("DBSPJ %u :       parallelism %u rows_expecting %u "
8822                       "rows_received %u firstBatch %u",
8823                       instance(),
8824                       data.m_parallelism,
8825                       data.m_rows_expecting,
8826                       data.m_rows_received,
8827                       data.m_firstBatch);
8828   g_eventLogger->info("DBSPJ %u :       totalRows %u totalBytes %u "
8829                       "constPrunePtrI %u",
8830                       instance(),
8831                       data.m_totalRows,
8832                       data.m_totalBytes,
8833                       data.m_constPrunePtrI);
8834   {
8835     Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8836     Ptr<ScanFragHandle> fragPtr;
8837     for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
8838     {
8839       dumpScanFragHandle(fragPtr);
8840     }
8841   }
8842 }
8843 
8844 /**
8845  * END - MODULE SCAN FRAGMENT
8846  */
8847 
8848 /**
8849  * Static OpInfo handling
8850  */
8851 const Dbspj::OpInfo*
8852 Dbspj::getOpInfo(Uint32 op)
8853 {
8854   DEBUG("getOpInfo(" << op << ")");
8855   switch(op){
8856   case QueryNode::QN_LOOKUP:
8857     return &Dbspj::g_LookupOpInfo;
8858   case QueryNode::QN_SCAN_FRAG_v1:
8859     return NULL; //Deprecated, converted into QN_SCAN_FRAG
8860   case QueryNode::QN_SCAN_INDEX_v1:
8861     return NULL; //Deprecated, converted into QN_SCAN_FRAG
8862   case QueryNode::QN_SCAN_FRAG:
8863     return &Dbspj::g_ScanFragOpInfo;
8864   default:
8865     return 0;
8866   }
8867 }
8868 
8869 /**
8870  * MODULE COMMON PARSE/UNPACK
8871  */
8872 
8873 /**
8874  *  @returns dstLen + 1 on error
8875  */
8876 static
8877 Uint32
8878 unpackList(Uint32 dstLen, Uint32 * dst, Dbspj::DABuffer & buffer)
8879 {
8880   const Uint32 * ptr = buffer.ptr;
8881   if (likely(ptr != buffer.end))
8882   {
8883     Uint32 tmp = * ptr++;
8884     Uint32 cnt = tmp & 0xFFFF;
8885 
8886     * dst ++ = (tmp >> 16); // Store first
8887     DEBUG("cnt: " << cnt << " first: " << (tmp >> 16));
8888 
8889     if (cnt > 1)
8890     {
8891       Uint32 len = cnt / 2;
8892       if (unlikely(cnt >= dstLen || (ptr + len > buffer.end)))
8893         goto error;
8894 
8895       cnt --; // subtract item stored in header
8896 
8897       for (Uint32 i = 0; i < cnt/2; i++)
8898       {
8899         * dst++ = (* ptr) & 0xFFFF;
8900         * dst++ = (* ptr) >> 16;
8901         ptr++;
8902       }
8903 
8904       if (cnt & 1)
8905       {
8906         * dst ++ = * ptr & 0xFFFF;
8907         ptr++;
8908       }
8909 
8910       cnt ++; // re-add item stored in header
8911     }
8912     buffer.ptr = ptr;
8913     return cnt;
8914   }
8915   return 0;
8916 
8917 error:
8918   return dstLen + 1;
8919 }
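
/**
 * Sketch of the packed list format consumed by unpackList(): one header
 * word holding (first_item << 16) | count, then the remaining items
 * packed two per word, low half first. The round-trip below is purely
 * illustrative (the helper name is made up) and is kept out of the build
 * with '#if 0'.
 */
#if 0
static void unpackList_example()
{
  // {5, 7, 9} encodes as [ (5 << 16) | 3, (9 << 16) | 7 ]
  static const Uint32 packed[] = { (5 << 16) | 3, (9 << 16) | 7 };
  Uint32 dst[4];
  Dbspj::DABuffer buffer;
  buffer.ptr = packed;
  buffer.end = packed + 2;
  const Uint32 cnt = unpackList(NDB_ARRAY_SIZE(dst), dst, buffer);
  ndbassert(cnt == 3 && dst[0] == 5 && dst[1] == 7 && dst[2] == 9);
}
#endif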
8920 
8921 /**
8922  * This function takes an array of attrinfo, and builds a "header"
8923  *   which can be used for random access inside the row.
8924  */
8925 Uint32
8926 Dbspj::buildRowHeader(RowPtr::Header * header, SegmentedSectionPtr ptr)
8927 {
8928   Uint32 tmp, len;
8929   Uint32 * dst = header->m_offset;
8930   const Uint32 * const save = dst;
8931   SectionReader r0(ptr, getSectionSegmentPool());
8932   Uint32 offset = 0;
8933   do
8934   {
8935     * dst++ = offset;
8936     r0.getWord(&tmp);
8937     len = AttributeHeader::getDataSize(tmp);
8938     offset += 1 + len;
8939   } while (r0.step(len));
8940 
8941   return header->m_len = static_cast<Uint32>(dst - save);
8942 }
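
/**
 * Worked example: for a row whose two AttributeHeaders declare data
 * sizes 2 and 1, the offsets stored are [0, 3] (each attribute occupies
 * one header word plus its data words) and header->m_len becomes 2.
 */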
8943 
8944 /**
8945  * This function takes an array of attrinfo, and builds a "header"
8946  *   which can be used for random access inside the row.
8947  */
8948 Uint32
8949 Dbspj::buildRowHeader(RowPtr::Header * header, const Uint32 *& src, Uint32 len)
8950 {
8951   Uint32 * dst = header->m_offset;
8952   const Uint32 * save = dst;
8953   Uint32 offset = 0;
8954   for (Uint32 i = 0; i<len; i++)
8955   {
8956     * dst ++ = offset;
8957     Uint32 tmp = * src++;
8958     Uint32 tmp_len = AttributeHeader::getDataSize(tmp);
8959     offset += 1 + tmp_len;
8960     src += tmp_len;
8961   }
8962 
8963   return header->m_len = static_cast<Uint32>(dst - save);
8964 }
8965 
8966 Uint32
8967 Dbspj::appendToPattern(Local_pattern_store & pattern,
8968                        DABuffer & tree, Uint32 len)
8969 {
8970   jam();
8971   if (unlikely(tree.ptr + len > tree.end))
8972     return DbspjErr::InvalidTreeNodeSpecification;
8973 
8974   if (ERROR_INSERTED_CLEAR(17008))
8975   {
8976     ndbout_c("Injecting OutOfQueryMemory error 17008 at line %d file %s",
8977              __LINE__,  __FILE__);
8978     jam();
8979     return DbspjErr::OutOfQueryMemory;
8980   }
8981   if (unlikely(pattern.append(tree.ptr, len)==0))
8982     return DbspjErr::OutOfQueryMemory;
8983 
8984   tree.ptr += len;
8985   return 0;
8986 }
8987 
8988 Uint32
8989 Dbspj::appendParamToPattern(Local_pattern_store& dst,
8990                             const RowPtr::Linear & row, Uint32 col)
8991 {
8992   jam();
8993   Uint32 offset = row.m_header->m_offset[col];
8994   const Uint32 * ptr = row.m_data + offset;
8995   Uint32 len = AttributeHeader::getDataSize(* ptr ++);
8996   /* Param COL's converted to DATA when appended to pattern */
8997   Uint32 info = QueryPattern::data(len);
8998 
8999   if (ERROR_INSERTED_CLEAR(17009))
9000   {
9001     ndbout_c("Injecting OutOfQueryMemory error 17009 at line %d file %s",
9002              __LINE__,  __FILE__);
9003     jam();
9004     return DbspjErr::OutOfQueryMemory;
9005   }
9006 
9007   return dst.append(&info,1) && dst.append(ptr,len) ? 0 : DbspjErr::OutOfQueryMemory;
9008 }
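
/**
 * Note: QueryPattern::data(len) builds the pattern-info word for a
 * P_DATA entry of 'len' words, so the parameter column is stored in the
 * pattern as [ P_DATA(len), d0 .. d(len-1) ] and is later replayed
 * verbatim by appendDataToSection() when the pattern is expanded.
 */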
9009 
9010 #ifdef ERROR_INSERT
9011 static int fi_cnt = 0;
9012 bool
9013 Dbspj::appendToSection(Uint32& firstSegmentIVal,
9014                          const Uint32* src, Uint32 len)
9015 {
9016   if (ERROR_INSERTED(17510) && fi_cnt++ % 13 == 0)
9017   {
9018     jam();
9019     ndbout_c("Injecting appendToSection error 17510 at line %d file %s",
9020              __LINE__,  __FILE__);
9021     return false;
9022   }
9023   else
9024   {
9025     return SimulatedBlock::appendToSection(firstSegmentIVal, src, len);
9026   }
9027 }
9028 #endif
9029 
9030 Uint32
9031 Dbspj::appendParamHeadToPattern(Local_pattern_store& dst,
9032                                 const RowPtr::Linear & row, Uint32 col)
9033 {
9034   jam();
9035   Uint32 offset = row.m_header->m_offset[col];
9036   const Uint32 * ptr = row.m_data + offset;
9037   Uint32 len = AttributeHeader::getDataSize(*ptr);
9038   /* Param COL's converted to DATA when appended to pattern */
9039   Uint32 info = QueryPattern::data(len+1);
9040 
9041   if (ERROR_INSERTED_CLEAR(17010))
9042   {
9043     ndbout_c("Injecting OutOfQueryMemory error 17010 at line %d file %s",
9044              __LINE__,  __FILE__);
9045     jam();
9046     return DbspjErr::OutOfQueryMemory;
9047   }
9048 
9049   return dst.append(&info,1) && dst.append(ptr,len+1) ? 0 : DbspjErr::OutOfQueryMemory;
9050 }
9051 
9052 Uint32
9053 Dbspj::appendReaderToSection(Uint32 &ptrI, SectionReader &reader, Uint32 len)
9054 {
9055   while (len > 0)
9056   {
9057     jam();
9058     const Uint32* readPtr;
9059     Uint32 readLen;
9060     ndbrequire(reader.getWordsPtr(len, readPtr, readLen));
9061     if (unlikely(!appendToSection(ptrI, readPtr, readLen)))
9062       return DbspjErr::OutOfSectionMemory;
9063     len -= readLen;
9064   }
9065   return 0;
9066 }
9067 
9068 void
9069 Dbspj::getCorrelationData(const RowPtr::Section & row,
9070                           Uint32 col,
9071                           Uint32& correlationNumber)
9072 {
9073   /**
9074    * TODO handle errors
9075    */
9076   SegmentedSectionPtr ptr(row.m_dataPtr);
9077   SectionReader reader(ptr, getSectionSegmentPool());
9078   Uint32 offset = row.m_header->m_offset[col];
9079   ndbrequire(reader.step(offset));
9080   Uint32 tmp;
9081   ndbrequire(reader.getWord(&tmp));
9082   Uint32 len = AttributeHeader::getDataSize(tmp);
9083   ndbrequire(len == 1);
9084   ndbrequire(AttributeHeader::getAttributeId(tmp) == AttributeHeader::CORR_FACTOR32);
9085   ndbrequire(reader.getWord(&correlationNumber));
9086 }
9087 
9088 void
9089 Dbspj::getCorrelationData(const RowPtr::Linear & row,
9090                           Uint32 col,
9091                           Uint32& correlationNumber)
9092 {
9093   /**
9094    * TODO handle errors
9095    */
9096   Uint32 offset = row.m_header->m_offset[col];
9097   Uint32 tmp = row.m_data[offset];
9098   Uint32 len = AttributeHeader::getDataSize(tmp);
9099   ndbrequire(len == 1);
9100   ndbrequire(AttributeHeader::getAttributeId(tmp) == AttributeHeader::CORR_FACTOR32);
9101   correlationNumber = row.m_data[offset+1];
9102 }
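
/**
 * Layout note: the 32-bit CORR_FACTOR32 value packs two 16-bit halves;
 * the high half carries the parent row's correlation id (extracted as
 * 'corrVal >> 16' in appendFromParent() below) and the low half the
 * row's own id within the batch.
 */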
9103 
9104 Uint32
9105 Dbspj::appendColToSection(Uint32 & dst, const RowPtr::Section & row,
9106                           Uint32 col, bool& hasNull)
9107 {
9108   jam();
9109   /**
9110    * TODO handle errors
9111    */
9112   SegmentedSectionPtr ptr(row.m_dataPtr);
9113   SectionReader reader(ptr, getSectionSegmentPool());
9114   Uint32 offset = row.m_header->m_offset[col];
9115   ndbrequire(reader.step(offset));
9116   Uint32 tmp;
9117   ndbrequire(reader.getWord(&tmp));
9118   Uint32 len = AttributeHeader::getDataSize(tmp);
9119   if (unlikely(len==0))
9120   {
9121     jam();
9122     hasNull = true;  // NULL-value in key
9123     return 0;
9124   }
9125   return appendReaderToSection(dst, reader, len);
9126 }
9127 
9128 Uint32
9129 Dbspj::appendColToSection(Uint32 & dst, const RowPtr::Linear & row,
9130                           Uint32 col, bool& hasNull)
9131 {
9132   jam();
9133   Uint32 offset = row.m_header->m_offset[col];
9134   const Uint32 * ptr = row.m_data + offset;
9135   Uint32 len = AttributeHeader::getDataSize(* ptr ++);
9136   if (unlikely(len==0))
9137   {
9138     jam();
9139     hasNull = true;  // NULL-value in key
9140     return 0;
9141   }
9142   return appendToSection(dst, ptr, len) ? 0 : DbspjErr::OutOfSectionMemory;
9143 }
9144 
9145 Uint32
9146 Dbspj::appendAttrinfoToSection(Uint32 & dst, const RowPtr::Linear & row,
9147                                Uint32 col, bool& hasNull)
9148 {
9149   jam();
9150   Uint32 offset = row.m_header->m_offset[col];
9151   const Uint32 * ptr = row.m_data + offset;
9152   Uint32 len = AttributeHeader::getDataSize(* ptr);
9153   if (unlikely(len==0))
9154   {
9155     jam();
9156     hasNull = true;  // NULL-value in key
9157   }
9158   return appendToSection(dst, ptr, 1 + len) ? 0 : DbspjErr::OutOfSectionMemory;
9159 }
9160 
9161 Uint32
9162 Dbspj::appendAttrinfoToSection(Uint32 & dst, const RowPtr::Section & row,
9163                                Uint32 col, bool& hasNull)
9164 {
9165   jam();
9166   /**
9167    * TODO handle errors
9168    */
9169   SegmentedSectionPtr ptr(row.m_dataPtr);
9170   SectionReader reader(ptr, getSectionSegmentPool());
9171   Uint32 offset = row.m_header->m_offset[col];
9172   ndbrequire(reader.step(offset));
9173   Uint32 tmp;
9174   ndbrequire(reader.peekWord(&tmp));
9175   Uint32 len = AttributeHeader::getDataSize(tmp);
9176   if (unlikely(len==0))
9177   {
9178     jam();
9179     hasNull = true;  // NULL-value in key
9180   }
9181   return appendReaderToSection(dst, reader, 1 + len);
9182 }
9183 
9184 /**
9185  * 'PkCol' is the composite NDB$PK column in a unique index consisting of
9186  * a fragment id and the composite PK value (all PK columns concatenated)
9187  */
9188 Uint32
9189 Dbspj::appendPkColToSection(Uint32 & dst, const RowPtr::Section & row, Uint32 col)
9190 {
9191   jam();
9192   /**
9193    * TODO handle errors
9194    */
9195   SegmentedSectionPtr ptr(row.m_dataPtr);
9196   SectionReader reader(ptr, getSectionSegmentPool());
9197   Uint32 offset = row.m_header->m_offset[col];
9198   ndbrequire(reader.step(offset));
9199   Uint32 tmp;
9200   ndbrequire(reader.getWord(&tmp));
9201   Uint32 len = AttributeHeader::getDataSize(tmp);
9202   ndbrequire(len>1);  // NULL-value in PkKey is an error
9203   ndbrequire(reader.step(1)); // Skip fragid
9204   return appendReaderToSection(dst, reader, len-1);
9205 }
9206 
9207 /**
9208  * 'PkCol' is the composite NDB$PK column in a unique index consisting of
9209  * a fragment id and the composite PK value (all PK columns concatenated)
9210  */
9211 Uint32
9212 Dbspj::appendPkColToSection(Uint32 & dst, const RowPtr::Linear & row, Uint32 col)
9213 {
9214   jam();
9215   Uint32 offset = row.m_header->m_offset[col];
9216   Uint32 tmp = row.m_data[offset];
9217   Uint32 len = AttributeHeader::getDataSize(tmp);
9218   ndbrequire(len>1);  // NULL-value in PkKey is an error
9219   return appendToSection(dst, row.m_data+offset+2, len - 1) ? 0 : DbspjErr::OutOfSectionMemory;
9220 }
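
/**
 * Worked example of the NDB$PK layout assumed above: for the word array
 * [ AttributeHeader(len=3) | fragId | pk0 | pk1 ], the code skips the
 * header and the fragment id (offset+2) and appends len-1 = 2 words,
 * i.e. only the concatenated PK value.
 */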
9221 
9222 Uint32
9223 Dbspj::appendFromParent(Uint32 & dst, Local_pattern_store& pattern,
9224                         Local_pattern_store::ConstDataBufferIterator& it,
9225                         Uint32 levels, const RowPtr & rowptr,
9226                         bool& hasNull)
9227 {
9228   jam();
9229   Ptr<TreeNode> treeNodePtr;
9230   m_treenode_pool.getPtr(treeNodePtr, rowptr.m_src_node_ptrI);
9231   Uint32 corrVal = rowptr.m_src_correlation;
9232   RowPtr targetRow;
9233   DEBUG("appendFromParent-of"
9234      << " node: " << treeNodePtr.p->m_node_no);
9235   while (levels--)
9236   {
9237     jam();
9238     if (unlikely(treeNodePtr.p->m_parentPtrI == RNIL))
9239     {
9240       DEBUG_CRASH();
9241       return DbspjErr::InvalidPattern;
9242     }
9243     m_treenode_pool.getPtr(treeNodePtr, treeNodePtr.p->m_parentPtrI);
9244     DEBUG("appendFromParent"
9245        << ", node: " << treeNodePtr.p->m_node_no);
9246     if (unlikely(treeNodePtr.p->m_rows.m_type != RowCollection::COLLECTION_MAP))
9247     {
9248       DEBUG_CRASH();
9249       return DbspjErr::InvalidPattern;
9250     }
9251 
9252     RowRef ref;
9253     treeNodePtr.p->m_rows.m_map.copyto(ref);
9254     const Uint32* const mapptr = get_row_ptr(ref);
9255 
9256     Uint32 pos = corrVal >> 16; // parent corr-val
9257     if (unlikely(! (pos < treeNodePtr.p->m_rows.m_map.m_size)))
9258     {
9259       DEBUG_CRASH();
9260       return DbspjErr::InvalidPattern;
9261     }
9262 
9263     // load ref to parent row
9264     treeNodePtr.p->m_rows.m_map.load(mapptr, pos, ref);
9265 
9266     const Uint32* const rowptr = get_row_ptr(ref);
9267     setupRowPtr(treeNodePtr, targetRow, ref, rowptr);
9268 
9269     if (levels)
9270     {
9271       jam();
9272       getCorrelationData(targetRow.m_row_data.m_linear,
9273                          targetRow.m_row_data.m_linear.m_header->m_len - 1,
9274                          corrVal);
9275     }
9276   }
9277 
9278   if (unlikely(it.isNull()))
9279   {
9280     DEBUG_CRASH();
9281     return DbspjErr::InvalidPattern;
9282   }
9283 
9284   Uint32 info = *it.data;
9285   Uint32 type = QueryPattern::getType(info);
9286   Uint32 val = QueryPattern::getLength(info);
9287   pattern.next(it);
9288   switch(type){
9289   case QueryPattern::P_COL:
9290     jam();
9291     return appendColToSection(dst, targetRow.m_row_data.m_linear, val, hasNull);
9292   case QueryPattern::P_UNQ_PK:
9293     jam();
9294     return appendPkColToSection(dst, targetRow.m_row_data.m_linear, val);
9295   case QueryPattern::P_ATTRINFO:
9296     jam();
9297     return appendAttrinfoToSection(dst, targetRow.m_row_data.m_linear, val, hasNull);
9298   case QueryPattern::P_DATA:
9299     jam();
9300     // retrieving DATA from parent is an error
9301     DEBUG_CRASH();
9302     return DbspjErr::InvalidPattern;
9303   case QueryPattern::P_PARENT:
9304     jam();
9305     // no point in nesting P_PARENT...an error
9306     DEBUG_CRASH();
9307     return DbspjErr::InvalidPattern;
9308   case QueryPattern::P_PARAM:
9309   case QueryPattern::P_PARAM_HEADER:
9310     jam();
9311     // should have been expanded during build
9312     DEBUG_CRASH();
9313     return DbspjErr::InvalidPattern;
9314   default:
9315     jam();
9316     DEBUG_CRASH();
9317     return DbspjErr::InvalidPattern;
9318   }
9319 }
9320 
9321 Uint32
9322 Dbspj::appendDataToSection(Uint32 & ptrI,
9323                            Local_pattern_store& pattern,
9324                            Local_pattern_store::ConstDataBufferIterator& it,
9325                            Uint32 len, bool& hasNull)
9326 {
9327   jam();
9328   if (unlikely(len==0))
9329   {
9330     jam();
9331     hasNull = true;
9332     return 0;
9333   }
9334 
9335 #if 0
9336   /**
9337    * TODO handle errors
9338    */
9339   Uint32 tmp[NDB_SECTION_SEGMENT_SZ];
9340   while (len > NDB_SECTION_SEGMENT_SZ)
9341   {
9342     pattern.copyout(tmp, NDB_SECTION_SEGMENT_SZ, it);
9343     appendToSection(ptrI, tmp, NDB_SECTION_SEGMENT_SZ);
9344     len -= NDB_SECTION_SEGMENT_SZ;
9345   }
9346 
9347   pattern.copyout(tmp, len, it);
9348   appendToSection(ptrI, tmp, len);
9349   return 0;
9350 #else
9351   Uint32 remaining = len;
9352   Uint32 dstIdx = 0;
9353   Uint32 tmp[NDB_SECTION_SEGMENT_SZ];
9354 
9355   while (remaining > 0 && !it.isNull())
9356   {
9357     tmp[dstIdx] = *it.data;
9358     remaining--;
9359     dstIdx++;
9360     pattern.next(it);
9361     if (dstIdx == NDB_SECTION_SEGMENT_SZ || remaining == 0)
9362     {
9363       if (!appendToSection(ptrI, tmp, dstIdx))
9364       {
9365         jam();
9366         return DbspjErr::OutOfSectionMemory;
9367       }
9368       dstIdx = 0;
9369     }
9370   }
9371   if (remaining > 0)
9372   {
9373     DEBUG_CRASH();
9374     return DbspjErr::InvalidPattern;
9375   }
9376   else
9377   {
9378     return 0;
9379   }
9380 #endif
9381 }
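
/**
 * Sizing note (worked example): with len = 70 and NDB_SECTION_SEGMENT_SZ
 * = 60, the loop above stages words in the on-stack 'tmp' buffer and
 * flushes it twice, one appendToSection() of 60 words followed by one of
 * 10 words, so at most one segment-sized copy is staged at a time.
 */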
9382 
9383 /**
9384  * This function takes a pattern and a row and expands it into a section
9385  */
9386 Uint32
9387 Dbspj::expandS(Uint32 & _dst, Local_pattern_store& pattern,
9388                const RowPtr & row, bool& hasNull)
9389 {
9390   Uint32 err;
9391   Uint32 dst = _dst;
9392   hasNull = false;
9393   Local_pattern_store::ConstDataBufferIterator it;
9394   pattern.first(it);
9395   while (!it.isNull())
9396   {
9397     Uint32 info = *it.data;
9398     Uint32 type = QueryPattern::getType(info);
9399     Uint32 val = QueryPattern::getLength(info);
9400     pattern.next(it);
9401     switch(type){
9402     case QueryPattern::P_COL:
9403       jam();
9404       err = appendColToSection(dst, row.m_row_data.m_section, val, hasNull);
9405       break;
9406     case QueryPattern::P_UNQ_PK:
9407       jam();
9408       err = appendPkColToSection(dst, row.m_row_data.m_section, val);
9409       break;
9410     case QueryPattern::P_ATTRINFO:
9411       jam();
9412       err = appendAttrinfoToSection(dst, row.m_row_data.m_section, val, hasNull);
9413       break;
9414     case QueryPattern::P_DATA:
9415       jam();
9416       err = appendDataToSection(dst, pattern, it, val, hasNull);
9417       break;
9418     case QueryPattern::P_PARENT:
9419       jam();
9420       // P_PARENT is a prefix to another pattern token
9421       // that permits code to access rows from ancestors above the immediate parent.
9422       // val is the number of levels to move up the tree.
9423       err = appendFromParent(dst, pattern, it, val, row, hasNull);
9424       break;
9425       // PARAMs were converted to DATA by ::expand(pattern...)
9426     case QueryPattern::P_PARAM:
9427     case QueryPattern::P_PARAM_HEADER:
9428     default:
9429       jam();
9430       err = DbspjErr::InvalidPattern;
9431       DEBUG_CRASH();
9432     }
9433     if (unlikely(err != 0))
9434     {
9435       jam();
9436       _dst = dst;
9437       return err;
9438     }
9439   }
9440 
9441   _dst = dst;
9442   return 0;
9443 }
9444 
9445 /**
9446  * This function takes a pattern and a row and expands it into a section
9447  */
9448 Uint32
9449 Dbspj::expandL(Uint32 & _dst, Local_pattern_store& pattern,
9450                const RowPtr & row, bool& hasNull)
9451 {
9452   Uint32 err;
9453   Uint32 dst = _dst;
9454   hasNull = false;
9455   Local_pattern_store::ConstDataBufferIterator it;
9456   pattern.first(it);
9457   while (!it.isNull())
9458   {
9459     Uint32 info = *it.data;
9460     Uint32 type = QueryPattern::getType(info);
9461     Uint32 val = QueryPattern::getLength(info);
9462     pattern.next(it);
9463     switch(type){
9464     case QueryPattern::P_COL:
9465       jam();
9466       err = appendColToSection(dst, row.m_row_data.m_linear, val, hasNull);
9467       break;
9468     case QueryPattern::P_UNQ_PK:
9469       jam();
9470       err = appendPkColToSection(dst, row.m_row_data.m_linear, val);
9471       break;
9472     case QueryPattern::P_ATTRINFO:
9473       jam();
9474       err = appendAttrinfoToSection(dst, row.m_row_data.m_linear, val, hasNull);
9475       break;
9476     case QueryPattern::P_DATA:
9477       jam();
9478       err = appendDataToSection(dst, pattern, it, val, hasNull);
9479       break;
9480     case QueryPattern::P_PARENT:
9481       jam();
9482       // P_PARENT is a prefix to another pattern token. It permits the
9483       // code to access rows from nodes earlier than the immediate parent;
9484       // 'val' is the number of levels to move up the tree.
9485       err = appendFromParent(dst, pattern, it, val, row, hasNull);
9486       break;
9487       // P_PARAMs were converted to P_DATA by ::expand(pattern...)
9488     case QueryPattern::P_PARAM:
9489     case QueryPattern::P_PARAM_HEADER:
9490     default:
9491       jam();
9492       err = DbspjErr::InvalidPattern;
9493       DEBUG_CRASH();
9494     }
9495     if (unlikely(err != 0))
9496     {
9497       jam();
9498       _dst = dst;
9499       return err;
9500     }
9501   }
9502 
9503   _dst = dst;
9504   return 0;
9505 }
9506 
9507 /* ::expand() used during initial 'build' phase on 'tree' + 'param' from API */
9508 Uint32
9509 Dbspj::expand(Uint32 & ptrI, DABuffer& pattern, Uint32 len,
9510               DABuffer& param, Uint32 paramCnt, bool& hasNull)
9511 {
9512   jam();
9513   /**
9514    * TODO handle error
9515    */
9516   Uint32 err = 0;
9517   Uint32 tmp[1+MAX_ATTRIBUTES_IN_TABLE];
9518   struct RowPtr::Linear row;
9519   row.m_data = param.ptr;
9520   row.m_header = CAST_PTR(RowPtr::Header, &tmp[0]);
9521   buildRowHeader(CAST_PTR(RowPtr::Header, &tmp[0]), param.ptr, paramCnt);
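  // Treat the parameter list as a linear 'row' so that P_PARAM /
  // P_PARAM_HEADER references below can be resolved with the ordinary
  // appendColToSection() / appendAttrinfoToSection() helpers.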
9522 
9523   Uint32 dst = ptrI;
9524   const Uint32 * ptr = pattern.ptr;
9525   const Uint32 * end = ptr + len;
9526   hasNull = false;
9527 
9528   for (; ptr < end; )
9529   {
9530     Uint32 info = * ptr++;
9531     Uint32 type = QueryPattern::getType(info);
9532     Uint32 val = QueryPattern::getLength(info);
9533     switch(type){
9534     case QueryPattern::P_PARAM:
9535       jam();
9536       ndbassert(val < paramCnt);
9537       err = appendColToSection(dst, row, val, hasNull);
9538       break;
9539     case QueryPattern::P_PARAM_HEADER:
9540       jam();
9541       ndbassert(val < paramCnt);
9542       err = appendAttrinfoToSection(dst, row, val, hasNull);
9543       break;
9544     case QueryPattern::P_DATA:
9545       if (unlikely(val==0))
9546       {
9547         jam();
9548         hasNull = true;
9549       }
9550       else if (likely(appendToSection(dst, ptr, val)))
9551       {
9552         jam();
9553         ptr += val;
9554       }
9555       else
9556       {
9557         jam();
9558         err = DbspjErr::OutOfSectionMemory;
9559       }
9560       break;
9561     case QueryPattern::P_COL:    // (linked) COL's not expected here
9562     case QueryPattern::P_PARENT: // Prefix to P_COL
9563     case QueryPattern::P_ATTRINFO:
9564     case QueryPattern::P_UNQ_PK:
9565     default:
9566       jam();
9567       jamLine(type);
9568       err = DbspjErr::InvalidPattern;
9569     }
9570     if (unlikely(err != 0))
9571     {
9572       jam();
9573       ptrI = dst;
9574       return err;
9575     }
9576   }
9577 
9578   /**
9579    * Advance the pattern buffer past the words consumed above
9580    */
9581   pattern.ptr = end;
9582   ptrI = dst;
9583   return 0;
9584 }
9585 
9586 /* ::expand() used during initial 'build' phase on 'tree' + 'param' from API;
9586    this variant builds a Local_pattern_store for later per-row expansion */
9587 Uint32
9588 Dbspj::expand(Local_pattern_store& dst, Ptr<TreeNode> treeNodePtr,
9589               DABuffer& pattern, Uint32 len,
9590               DABuffer& param, Uint32 paramCnt)
9591 {
9592   jam();
9593   /**
9594    * TODO handle error
9595    */
9596   Uint32 err;
9597   Uint32 tmp[1+MAX_ATTRIBUTES_IN_TABLE];
9598   struct RowPtr::Linear row;
9599   row.m_header = CAST_PTR(RowPtr::Header, &tmp[0]);
9600   row.m_data = param.ptr;
9601   buildRowHeader(CAST_PTR(RowPtr::Header, &tmp[0]), param.ptr, paramCnt);
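  // As in the section-expand variant above, treat the parameter list as a
  // linear 'row'; P_PARAM tokens are resolved against it and stored as
  // P_DATA in the constructed pattern.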
9602 
9603   const Uint32 * end = pattern.ptr + len;
9604   for (; pattern.ptr < end; )
9605   {
9606     Uint32 info = *pattern.ptr;
9607     Uint32 type = QueryPattern::getType(info);
9608     Uint32 val = QueryPattern::getLength(info);
9609     switch(type){
9610     case QueryPattern::P_COL:
9611     case QueryPattern::P_UNQ_PK:
9612     case QueryPattern::P_ATTRINFO:
9613       jam();
9614       err = appendToPattern(dst, pattern, 1);
9615       break;
9616     case QueryPattern::P_DATA:
9617       jam();
9618       err = appendToPattern(dst, pattern, val+1);
9619       break;
9620     case QueryPattern::P_PARAM:
9621       jam();
9622       // NOTE: Converted to P_DATA by appendParamToPattern
9623       ndbassert(val < paramCnt);
9624       err = appendParamToPattern(dst, row, val);
9625       pattern.ptr++;
9626       break;
9627     case QueryPattern::P_PARAM_HEADER:
9628       jam();
9629       // NOTE: Converted to P_DATA by appendParamHeadToPattern
9630       ndbassert(val < paramCnt);
9631       err = appendParamHeadToPattern(dst, row, val);
9632       pattern.ptr++;
9633       break;
9634     case QueryPattern::P_PARENT: // Prefix to P_COL
9635     {
9636       jam();
9637       err = appendToPattern(dst, pattern, 1);
9638       if (unlikely(err))
9639       {
9640         jam();
9641         break;
9642       }
9643       // Locate the requested grandparent and request it to buffer
9644       // (T_BUFFER_ROW) its result rows
9645       Ptr<TreeNode> parentPtr;
9646       m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
9647       while (val--)
9648       {
9649         jam();
9650         ndbassert(parentPtr.p->m_parentPtrI != RNIL);
9651         m_treenode_pool.getPtr(parentPtr, parentPtr.p->m_parentPtrI);
9652         parentPtr.p->m_bits |= TreeNode::T_BUFFER_ROW;
9653         parentPtr.p->m_bits |= TreeNode::T_BUFFER_MAP;
9654       }
9655       Ptr<Request> requestPtr;
9656       m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
9657       requestPtr.p->m_bits |= Request::RT_BUFFERS;
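      // Flag the request as a whole: at least one of its tree nodes
      // requires result rows to be buffered (RT_BUFFERS).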
9658       break;
9659     }
9660     default:
9661       err = DbspjErr::InvalidPattern;
9662       jam();
9663     }
9664 
9665     if (unlikely(err != 0))
9666     {
9667       jam();
9668       return err;
9669     }
9670   }
9671   return 0;
9672 }
9673 
9674 Uint32
9675 Dbspj::parseDA(Build_context& ctx,
9676                Ptr<Request> requestPtr,
9677                Ptr<TreeNode> treeNodePtr,
9678                DABuffer& tree, Uint32 treeBits,
9679                DABuffer& param, Uint32 paramBits)
9680 {
9681   Uint32 err;
9682   Uint32 attrInfoPtrI = RNIL;
9683   Uint32 attrParamPtrI = RNIL;
9684 
9685   do
9686   {
9687     /**
9688      * Test execution terminating due to 'OutOfSectionMemory', which
9689      * may happen in multiple places (e.g. appendToSection, expand) below:
9690      * - 17050: Fail on parseDA at first call
9691      * - 17051: Fail on parseDA if 'isLeaf'
9692      * - 17052: Fail on parseDA if treeNode not root
9693      * - 17053: Fail on parseDA at a random node of the query tree
9694      */
9695     if (ERROR_INSERTED(17050) ||
9696        (ERROR_INSERTED(17051) && (treeNodePtr.p->isLeaf())) ||
9697        (ERROR_INSERTED(17052) && (treeNodePtr.p->m_parentPtrI != RNIL)) ||
9698        (ERROR_INSERTED(17053) && (rand() % 7) == 0))
9699     {
9700       jam();
9701       CLEAR_ERROR_INSERT_VALUE;
9702       ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
9703                 __LINE__,  __FILE__);
9704       err = DbspjErr::OutOfSectionMemory;
9705       break;
9706     }
9707 
9708     if (treeBits & DABits::NI_REPEAT_SCAN_RESULT)
9709     {
9710       jam();
9711       DEBUG("use REPEAT_SCAN_RESULT when returning results");
9712       requestPtr.p->m_bits |= Request::RT_REPEAT_SCAN_RESULT;
9713     } // DABits::NI_REPEAT_SCAN_RESULT
9714 
9715     if (treeBits & DABits::NI_INNER_JOIN)
9716     {
9717       jam();
9718       DEBUG("INNER_JOIN optimization used");
9719       treeNodePtr.p->m_bits |= TreeNode::T_INNER_JOIN;
9720     } // DABits::NI_INNER_JOIN
9721 
9722     // TODO: FirstMatch not implemented in SPJ block yet.
9723     // Later implementation will build on the BUFFER_ROW / _MATCH mechanisms
9724     // to eliminate already found matches from SCAN_NEXTREQ
9725     if (treeBits & DABits::NI_FIRST_MATCH)
9726     {
9727       jam();
9728       DEBUG("FIRST_MATCH optimization used");
9729       treeNodePtr.p->m_bits |= TreeNode::T_FIRST_MATCH;
9730     } // DABits::NI_FIRST_MATCH
9731 
9732     if (treeBits & DABits::NI_HAS_PARENT)
9733     {
9734       jam();
9735       DEBUG("NI_HAS_PARENT");
9736       /**
9737        * OPTIONAL PART 1:
9738        *
9739        * Parent nodes are stored first in the optional part.
9740        *   This is a list of 16-bit numbers referring to
9741        *   *earlier* nodes in the tree;
9742        *   the list stores its own length as the first 16-bit word.
9743        */
9744       err = DbspjErr::InvalidTreeNodeSpecification;
9745       Uint32 dst[63];
9746       Uint32 cnt = unpackList(NDB_ARRAY_SIZE(dst), dst, tree);
9747       if (unlikely(cnt > NDB_ARRAY_SIZE(dst)))
9748       {
9749         jam();
9750         break;
9751       }
9752 
9753       if (unlikely(cnt!=1))
9754       {
9755         /**
9756          * Only a single parent is supported for now, i.e. only trees
9757          */
9758         jam();
9759         break;
9760       }
9761 
9762       err = 0;
9763       for (Uint32 i = 0; i<cnt; i++)
9764       {
9765         DEBUG("adding " << dst[i] << " as parent");
9766         Ptr<TreeNode> parentPtr = ctx.m_node_list[dst[i]];
9767         LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
9768         Local_dependency_map map(pool, parentPtr.p->m_child_nodes);
9769         if (unlikely(!map.append(&treeNodePtr.i, 1)))
9770         {
9771           err = DbspjErr::OutOfQueryMemory;
9772           jam();
9773           break;
9774         }
9775         treeNodePtr.p->m_parentPtrI = parentPtr.i;
9776       }
9777 
9778       if (unlikely(err != 0))
9779         break;
9780     } // DABits::NI_HAS_PARENT
9781 
9782     err = DbspjErr::InvalidTreeParametersSpecificationKeyParamBitsMissmatch;
9783     if (unlikely( ((treeBits  & DABits::NI_KEY_PARAMS)==0) !=
9784                   ((paramBits & DABits::PI_KEY_PARAMS)==0)))
9785     {
9786       jam();
9787       break;
9788     }
9789 
9790     if (treeBits & (DABits::NI_KEY_PARAMS
9791                     | DABits::NI_KEY_LINKED
9792                     | DABits::NI_KEY_CONSTS))
9793     {
9794       jam();
9795       DEBUG("NI_KEY_PARAMS | NI_KEY_LINKED | NI_KEY_CONSTS");
9796 
9797       /**
9798        * OPTIONAL PART 2:
9799        *
9800        * If keys are parametrized or linked
9801        *   DATA0[LO/HI] - Length of key pattern/#parameters to key
9802        */
9803       Uint32 len_cnt = * tree.ptr ++;
9804       Uint32 len = len_cnt & 0xFFFF; // length of pattern in words
9805       Uint32 cnt = len_cnt >> 16;    // no of parameters
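      // For example (illustration only): len_cnt == 0x00020005 describes
      // a 5-word key pattern that references 2 parameter values.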
9806 
9807       err = DbspjErr::InvalidTreeParametersSpecificationIncorrectKeyParamCount;
9808       if (unlikely( ((cnt==0) != ((treeBits & DABits::NI_KEY_PARAMS) == 0)) ||
9809                     ((cnt==0) != ((paramBits & DABits::PI_KEY_PARAMS) == 0))))
9810       {
9811         jam();
9812         break;
9813       }
9814 
9815       if (treeBits & DABits::NI_KEY_LINKED)
9816       {
9817         jam();
9818         LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
9819         Local_pattern_store pattern(pool, treeNodePtr.p->m_keyPattern);
9820 
9821         DEBUG("LINKED-KEY PATTERN w/ " << cnt << " PARAM values");
9822         /**
9823          * Expand pattern into a new pattern (with linked values)
9824          */
9825         err = expand(pattern, treeNodePtr, tree, len, param, cnt);
9826         if (unlikely(err != 0))
9827         {
9828           jam();
9829           break;
9830         }
9831         /**
9832          * This node constructs a new key for each send
9833          */
9834         treeNodePtr.p->m_bits |= TreeNode::T_KEYINFO_CONSTRUCTED;
9835       }
9836       else
9837       {
9838         jam();
9839         DEBUG("FIXED-KEY w/ " << cnt << " PARAM values");
9840         /**
9841          * Expand pattern directly into keyinfo
9842          *   This means a "fixed" key from here on
9843          */
9844         bool hasNull;
9845         Uint32 keyInfoPtrI = RNIL;
9846         err = expand(keyInfoPtrI, tree, len, param, cnt, hasNull);
9847         if (unlikely(err != 0))
9848         {
9849           jam();
9850           releaseSection(keyInfoPtrI);
9851           break;
9852         }
9853         if (unlikely(hasNull))
9854         {
9855          /* The API should have eliminated requests w/ const-NULL keys */
9856           jam();
9857           DEBUG("BEWARE: FIXED-key contain NULL values");
9858           releaseSection(keyInfoPtrI);
9859 //        treeNodePtr.p->m_bits |= TreeNode::T_NULL_PRUNE;
9860 //        break;
9861           ndbabort();
9862         }
9863         treeNodePtr.p->m_send.m_keyInfoPtrI = keyInfoPtrI;
9864       }
9865       ndbassert(err == 0); // All errors should have been handled
9866     } // DABits::NI_KEY_...
9867 
9868     const Uint32 mask =
9869       DABits::NI_LINKED_ATTR | DABits::NI_ATTR_INTERPRET |
9870       DABits::NI_ATTR_LINKED | DABits::NI_ATTR_PARAMS;
9871 
9872     if (((treeBits & mask) | (paramBits & DABits::PI_ATTR_LIST)) != 0)
9873     {
9874       jam();
9875       /**
9876        * OPTIONAL PART 3: attrinfo handling
9877        * - NI_LINKED_ATTR - these are attributes to be passed to children
9878        * - PI_ATTR_LIST   - this is "user-columns" (passed as parameters)
9879        *
9880        * - NI_ATTR_INTERPRET - tree contains interpreted program
9881        * - NI_ATTR_LINKED - means that the attr-info contains linked-values
9882        * - NI_ATTR_PARAMS - means that the attr-info is parameterized
9883        *   PI_ATTR_PARAMS - means that the parameters contains attr parameters
9884        *
9885        * IF NI_ATTR_INTERPRET
9886        *   DATA0[LO/HI] = Length of program / total #arguments to program
9887        *   DATA1..N     = Program
9888        *
9889        * IF NI_ATTR_PARAMS
9890        *   DATA0[LO/HI] = Length / #param
9891        *   DATA1..N     = PARAM-0...PARAM-M
9892        *
9893        * IF PI_ATTR_INTERPRET
9894        *   DATA0[LO/HI] = Length of program / Length of subroutine-part
9895        *   DATA1..N     = Program (scan filter)
9896        *
9897        * IF NI_ATTR_LINKED
9898        *   DATA0[LO/HI] = Length / #
9899        *
9900        *
9901        */
9902       Uint32 sections[5] = { 0, 0, 0, 0, 0 };
9903       Uint32 * sectionptrs = 0;
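      // The 5 words appended below form the attrinfo section-length header
      // used by interpreted execution (see Dbtup::interpreterStartLab()).
      // Only sectionptrs[1] (interpreted program), [3] (final read) and
      // [4] (subroutines) are filled in by the code below.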
9904 
9905       bool interpreted =
9906         (treeBits & DABits::NI_ATTR_INTERPRET) ||
9907         (paramBits & DABits::PI_ATTR_INTERPRET) ||
9908         (treeNodePtr.p->m_bits & TreeNode::T_ATTR_INTERPRETED);
9909 
9910       if (interpreted)
9911       {
9912         /**
9913          * Add section headers for interpreted execution
9914          *   and create pointer so that they can be updated later
9915          */
9916         jam();
9917         err = DbspjErr::OutOfSectionMemory;
9918         if (unlikely(!appendToSection(attrInfoPtrI, sections, 5)))
9919         {
9920           jam();
9921           break;
9922         }
9923 
9924         SegmentedSectionPtr ptr;
9925         getSection(ptr, attrInfoPtrI);
9926         sectionptrs = ptr.p->theData;
9927 
9928         if (treeBits & DABits::NI_ATTR_INTERPRET)
9929         {
9930           jam();
9931 
9932           /**
9933            * Having two interpreter programs is an error.
9934            */
9935           err = DbspjErr::BothTreeAndParametersContainInterpretedProgram;
9936           if (unlikely(paramBits & DABits::PI_ATTR_INTERPRET))
9937           {
9938             jam();
9939             break;
9940           }
9941 
9942           treeNodePtr.p->m_bits |= TreeNode::T_ATTR_INTERPRETED;
9943           Uint32 len2 = * tree.ptr++;
9944           Uint32 len_prg = len2 & 0xFFFF; // Length of interpret program
9945           Uint32 len_pattern = len2 >> 16;// Length of attr param pattern
9946           err = DbspjErr::OutOfSectionMemory;
9947           if (unlikely(!appendToSection(attrInfoPtrI, tree.ptr, len_prg)))
9948           {
9949             jam();
9950             break;
9951           }
9952 
9953           tree.ptr += len_prg;
9954           sectionptrs[1] = len_prg; // size of interpret program
9955 
9956           Uint32 tmp = * tree.ptr ++; // attr-pattern header
9957           Uint32 cnt = tmp & 0xFFFF;
9958 
9959           if (treeBits & DABits::NI_ATTR_LINKED)
9960           {
9961             jam();
9962             /**
9963              * Expand pattern into a new pattern (with linked values)
9964              */
9965             LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena,
9966                                     m_dependency_map_pool);
9967             Local_pattern_store pattern(pool,treeNodePtr.p->m_attrParamPattern);
9968             err = expand(pattern, treeNodePtr, tree, len_pattern, param, cnt);
9969             if (unlikely(err))
9970             {
9971               jam();
9972               break;
9973             }
9974             /**
9975              * This node constructs a new attr-info for each send
9976              */
9977             treeNodePtr.p->m_bits |= TreeNode::T_ATTRINFO_CONSTRUCTED;
9978           }
9979           else
9980           {
9981             jam();
9982             /**
9983              * Expand pattern directly into attr-info param
9984              *   This means a "fixed" attr-info param from here on
9985              */
9986             bool hasNull;
9987             err = expand(attrParamPtrI, tree, len_pattern, param, cnt, hasNull);
9988             if (unlikely(err))
9989             {
9990               jam();
9991               break;
9992             }
9993 //          ndbrequire(!hasNull);
9994           }
9995         }
9996         else // if (treeBits & DABits::NI_ATTR_INTERPRET)
9997         {
9998           jam();
9999           /**
10000            * Only relevant for interpreted stuff
10001            */
10002           ndbrequire((treeBits & DABits::NI_ATTR_PARAMS) == 0);
10003           ndbrequire((paramBits & DABits::PI_ATTR_PARAMS) == 0);
10004           ndbrequire((treeBits & DABits::NI_ATTR_LINKED) == 0);
10005 
10006           treeNodePtr.p->m_bits |= TreeNode::T_ATTR_INTERPRETED;
10007 
10008           if (! (paramBits & DABits::PI_ATTR_INTERPRET))
10009           {
10010             jam();
10011 
10012             /**
10013              * The tree node uses interpreted execution,
10014              *   but no interpreted program was specified:
10015              *   auto-add ExitOK (i.e. return each row)
10016              */
10017             Uint32 tmp = Interpreter::ExitOK();
10018             err = DbspjErr::OutOfSectionMemory;
10019             if (unlikely(!appendToSection(attrInfoPtrI, &tmp, 1)))
10020             {
10021               jam();
10022               break;
10023             }
10024             sectionptrs[1] = 1;
10025           }
10026         } // if (treeBits & DABits::NI_ATTR_INTERPRET)
10027       } // if (interpreted)
10028 
10029       if (paramBits & DABits::PI_ATTR_INTERPRET)
10030       {
10031         jam();
10032 
10033         /**
10034          * Add the interpreted code that represents the scan filter.
10035          */
10036         const Uint32 len2 = * param.ptr++;
10037         Uint32 program_len = len2 & 0xFFFF;
10038         Uint32 subroutine_len = len2 >> 16;
10039         err = DbspjErr::OutOfSectionMemory;
10040         if (unlikely(!appendToSection(attrInfoPtrI, param.ptr, program_len)))
10041         {
10042           jam();
10043           break;
10044         }
10045         /**
10046          * The interpreted code is added in the "Interpreted execute region"
10047          * of the attrinfo (see Dbtup::interpreterStartLab() for details).
10048          * It will thus execute before reading the attributes that constitute
10049          * the projection.
10050          */
10051         sectionptrs[1] = program_len;
10052         param.ptr += program_len;
10053 
10054         if (subroutine_len)
10055         {
10056           if (unlikely(!appendToSection(attrParamPtrI,
10057                                         param.ptr, subroutine_len)))
10058           {
10059             jam();
10060             break;
10061           }
10062           sectionptrs[4] = subroutine_len;
10063           param.ptr += subroutine_len;
10064         }
10065         treeNodePtr.p->m_bits |= TreeNode::T_ATTR_INTERPRETED;
10066       }
10067 
10068       Uint32 sum_read = 0;
10069       Uint32 dst[MAX_ATTRIBUTES_IN_TABLE + 2];
10070 
10071       if (paramBits & DABits::PI_ATTR_LIST)
10072       {
10073         jam();
10074         Uint32 len = * param.ptr++;
10075         DEBUG("PI_ATTR_LIST");
10076 
10077         treeNodePtr.p->m_bits |= TreeNode::T_USER_PROJECTION;
10078         err = DbspjErr::OutOfSectionMemory;
10079         if (!appendToSection(attrInfoPtrI, param.ptr, len))
10080         {
10081           jam();
10082           break;
10083         }
10084 
10085         param.ptr += len;
10086         sum_read += len;
10087 
10088         const NodeId API_node = refToNode(ctx.m_resultRef);
10089         const Uint32 API_version = getNodeInfo(API_node).m_version;
10090 
10091         /**
10092          * We have just added a 'USER_PROJECTION', which is the
10093          * result row returned to the SPJ-API. If we will also add a
10094          * projection of SPJ keys (NI_LINKED_ATTR), we need to
10095          * insert a FLUSH of the client results now; else the
10096          * FLUSH is skipped, as we produced only a single result
10097          * projection (to the API client).
10098          *
10099          * However, for scan requests we will always need the FLUSH:
10100          * LqhKeyReq::tcBlockref needs to refer to this SPJ block, as
10101          * it is used to send the required REF/CONF to SPJ. However,
10102          * tcBlockref is also used as the 'route' dest for TRANSID_AI_R,
10103          * which should be routed to the requesting TC block. Thus
10104          * we need the FLUSH, which specifies its own RouteRef.
10105          *
10106          * We also need to keep this under API-version control, as
10107          * older API versions assumed that all SPJ results were
10108          * returned as 'long' signals.
10109          */
10110         if (treeBits & DABits::NI_LINKED_ATTR ||
10111             requestPtr.p->isScan() ||
10112             !ndbd_spj_api_support_short_TRANSID_AI(API_version))
10113         {
10114           /**
10115            * Insert a FLUSH_AI of 'USER_PROJECTION' result (to client)
10116            * before the 'LINKED_ATTR' results to SPJ are produced.
10117            */
10118           jam();
10119           Uint32 flush[4];
10120           flush[0] = AttributeHeader::FLUSH_AI << 16;
10121           flush[1] = ctx.m_resultRef;
10122           flush[2] = ctx.m_resultData;
10123           flush[3] = ctx.m_senderRef; // RouteRef
10124           if (!appendToSection(attrInfoPtrI, flush, 4))
10125           {
10126             jam();
10127             break;
10128           }
10129           sum_read += 4;
10130         }
10131       }
10132 
10133       if (treeBits & DABits::NI_LINKED_ATTR)
10134       {
10135         jam();
10136         DEBUG("NI_LINKED_ATTR");
10137         err = DbspjErr::InvalidTreeNodeSpecification;
10138         Uint32 cnt = unpackList(MAX_ATTRIBUTES_IN_TABLE, dst, tree);
10139         if (unlikely(cnt > MAX_ATTRIBUTES_IN_TABLE))
10140         {
10141           jam();
10142           break;
10143         }
10144 
10145         /**
10146          * AttributeHeader stores the attrId in the upper 16 bits
10147          */
10148         for (Uint32 i = 0; i<cnt; i++)
10149           dst[i] <<= 16;
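        // e.g. attrId 3 becomes the AttributeHeader word 0x00030000
        // (a plain 'read attribute' request with no inline data).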
10150 
10151         /**
10152          * Read correlation factor
10153          */
10154         dst[cnt++] = AttributeHeader::CORR_FACTOR32 << 16;
10155 
10156         err = DbspjErr::OutOfSectionMemory;
10157         if (!appendToSection(attrInfoPtrI, dst, cnt))
10158         {
10159           jam();
10160           break;
10161         }
10162         sum_read += cnt;
10163         treeNodePtr.p->m_bits |= TreeNode::T_EXPECT_TRANSID_AI;
10164 
10165         // Having a key projection for a LINKED child implies not-LEAF
10166         treeNodePtr.p->m_bits &= ~(Uint32)TreeNode::T_LEAF;
10167       }
10168       /**
10169        * If no LINKED_ATTRs, including the CORR_FACTOR, were requested by
10170        * the API, the SPJ block requests a CORR_FACTOR on its own.
10171        * It will be used to keep track of whether a 'match' was found
10172        * for the requested parent row.
10173        */
10174       else if (requestPtr.p->isScan() &&
10175 	       (treeNodePtr.p->m_bits & TreeNode::T_INNER_JOIN))
10176       {
10177         jam();
10178         Uint32 cnt = 0;
10179         /**
10180          * Only read correlation factor
10181          */
10182         dst[cnt++] = AttributeHeader::CORR_FACTOR32 << 16;
10183 
10184         err = DbspjErr::OutOfSectionMemory;
10185         if (!appendToSection(attrInfoPtrI, dst, cnt))
10186         {
10187           jam();
10188           break;
10189         }
10190         sum_read += cnt;
10191         treeNodePtr.p->m_bits |= TreeNode::T_EXPECT_TRANSID_AI;
10192       }
10193 
10194       if (interpreted)
10195       {
10196         jam();
10197         /**
10198          * Let reads be performed *after* interpreted program
10199          *   i.e. in the "final read" section
10200          */
10201         sectionptrs[3] = sum_read;
10202 
10203         if (attrParamPtrI != RNIL)
10204         {
10205           jam();
10206           ndbrequire(!(treeNodePtr.p->m_bits&TreeNode::T_ATTRINFO_CONSTRUCTED));
10207 
10208           SegmentedSectionPtr ptr;
10209           getSection(ptr, attrParamPtrI);
10210           {
10211             SectionReader r0(ptr, getSectionSegmentPool());
10212             err = appendReaderToSection(attrInfoPtrI, r0, ptr.sz);
10213             if (unlikely(err != 0))
10214             {
10215               jam();
10216               break;
10217             }
10218             sectionptrs[4] = ptr.sz;
10219           }
10220           releaseSection(attrParamPtrI);
10221           attrParamPtrI = RNIL;
10222         }
10223       }
10224 
10225       treeNodePtr.p->m_send.m_attrInfoPtrI = attrInfoPtrI;
10226       attrInfoPtrI = RNIL;
10227     } // if (((treeBits & mask) | (paramBits & DABits::PI_ATTR_LIST)) != 0)
10228 
10229     // An empty attrinfo would cause a node crash.
10230     if (treeNodePtr.p->m_send.m_attrInfoPtrI == RNIL)
10231     {
10232       jam();
10233 
10234       // Add dummy interpreted program.
10235       Uint32 tmp = Interpreter::ExitOK();
10236       err = DbspjErr::OutOfSectionMemory;
10237       if (unlikely(!appendToSection(treeNodePtr.p->m_send.m_attrInfoPtrI, &tmp, 1)))
10238       {
10239         jam();
10240         break;
10241       }
10242     }
10243 
10244     return 0;
10245   } while (0);
10246 
10247   if (attrInfoPtrI != RNIL)
10248   {
10249     jam();
10250     releaseSection(attrInfoPtrI);
10251   }
10252 
10253   if (attrParamPtrI != RNIL)
10254   {
10255     jam();
10256     releaseSection(attrParamPtrI);
10257   }
10258 
10259   return err;
10260 }
10261 
10262 /**
10263  * END - MODULE COMMON PARSE/UNPACK
10264  */
10265 
10266 /**
10267  * Process a scan request for an ndb$info table. (These are used for monitoring
10268  * purposes and do not contain application data.)
10269  */
10270 void Dbspj::execDBINFO_SCANREQ(Signal *signal)
10271 {
10272   DbinfoScanReq req= * CAST_PTR(DbinfoScanReq, &signal->theData[0]);
10273   const Ndbinfo::ScanCursor* cursor =
10274     CAST_CONSTPTR(Ndbinfo::ScanCursor, DbinfoScan::getCursorPtr(&req));
10275   Ndbinfo::Ratelimit rl;
10276 
10277   jamEntry();
10278 
10279   switch(req.tableId){
10280 
10281     // The SPJ block only implements the ndbinfo.counters table.
10282   case Ndbinfo::COUNTERS_TABLEID:
10283   {
10284     Ndbinfo::counter_entry counters[] = {
10285       { Ndbinfo::SPJ_READS_RECEIVED_COUNTER,
10286         c_Counters.get_counter(CI_READS_RECEIVED) },
10287       { Ndbinfo::SPJ_LOCAL_READS_SENT_COUNTER,
10288         c_Counters.get_counter(CI_LOCAL_READS_SENT) },
10289       { Ndbinfo::SPJ_REMOTE_READS_SENT_COUNTER,
10290         c_Counters.get_counter(CI_REMOTE_READS_SENT) },
10291       { Ndbinfo::SPJ_READS_NOT_FOUND_COUNTER,
10292         c_Counters.get_counter(CI_READS_NOT_FOUND) },
10293       { Ndbinfo::SPJ_TABLE_SCANS_RECEIVED_COUNTER,
10294         c_Counters.get_counter(CI_TABLE_SCANS_RECEIVED) },
10295       { Ndbinfo::SPJ_LOCAL_TABLE_SCANS_SENT_COUNTER,
10296         c_Counters.get_counter(CI_LOCAL_TABLE_SCANS_SENT) },
10297       { Ndbinfo::SPJ_RANGE_SCANS_RECEIVED_COUNTER,
10298         c_Counters.get_counter(CI_RANGE_SCANS_RECEIVED) },
10299       { Ndbinfo::SPJ_LOCAL_RANGE_SCANS_SENT_COUNTER,
10300         c_Counters.get_counter(CI_LOCAL_RANGE_SCANS_SENT) },
10301       { Ndbinfo::SPJ_REMOTE_RANGE_SCANS_SENT_COUNTER,
10302         c_Counters.get_counter(CI_REMOTE_RANGE_SCANS_SENT) },
10303       { Ndbinfo::SPJ_SCAN_BATCHES_RETURNED_COUNTER,
10304         c_Counters.get_counter(CI_SCAN_BATCHES_RETURNED) },
10305       { Ndbinfo::SPJ_SCAN_ROWS_RETURNED_COUNTER,
10306         c_Counters.get_counter(CI_SCAN_ROWS_RETURNED) },
10307       { Ndbinfo::SPJ_PRUNED_RANGE_SCANS_RECEIVED_COUNTER,
10308         c_Counters.get_counter(CI_PRUNED_RANGE_SCANS_RECEIVED) },
10309       { Ndbinfo::SPJ_CONST_PRUNED_RANGE_SCANS_RECEIVED_COUNTER,
10310         c_Counters.get_counter(CI_CONST_PRUNED_RANGE_SCANS_RECEIVED) }
10311     };
10312     const size_t num_counters = sizeof(counters) / sizeof(counters[0]);
10313 
10314     Uint32 i = cursor->data[0];
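    // Resume from the counter index saved in the cursor when a previous
    // invocation hit the rate limit (starts at 0 on the first call).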
10315     const BlockNumber bn = blockToMain(number());
10316     while(i < num_counters)
10317     {
10318       jam();
10319       Ndbinfo::Row row(signal, req);
10320       row.write_uint32(getOwnNodeId());
10321       row.write_uint32(bn);           // block number
10322       row.write_uint32(instance());   // block instance
10323       row.write_uint32(counters[i].id);
10324 
10325       row.write_uint64(counters[i].val);
10326       ndbinfo_send_row(signal, req, row, rl);
10327       i++;
10328       if (rl.need_break(req))
10329       {
10330         jam();
10331         ndbinfo_send_scan_break(signal, req, rl, i);
10332         return;
10333       }
10334     }
10335     break;
10336   }
10337 
10338   default:
10339     break;
10340   }
10341 
10342   ndbinfo_send_scan_conf(signal, req, rl);
10343 } // Dbspj::execDBINFO_SCANREQ(Signal *signal)
10344 
10345 
10346 /**
10347  * Incremental calculation of standard deviation:
10348  *
10349  * Suppose that the data set is x1, x2,..., xn; then for each xn
10350  * we can find the updated mean (M) and sum of squared deviations (S) as:
10351  *
10352  * M(1) = x(1), M(k) = M(k-1) + (x(k) - M(k-1)) / k
10353  * S(1) = 0, S(k) = S(k-1) + (x(k) - M(k-1)) * (x(k) - M(k))
10354  *
10355  * Source: http://mathcentral.uregina.ca/QQ/database/QQ.09.02/carlos1.html
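 *
 * Worked example (illustration only): for the samples 2, 4, 6 the
 * recurrences give M(1)=2, S(1)=0; M(2)=3, S(2)=2; M(3)=4, S(3)=8,
 * i.e. a sample variance of S(3)/(3-1) = 4 and a std deviation of 2.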
10356  */
10357 void Dbspj::IncrementalStatistics::sample(double observation)
10358 {
10359   // Prevent wrap-around
10360   if(m_noOfSamples < 0xffffffff)
10361   {
10362     m_noOfSamples++;
10363     const double delta = observation - m_mean;
10364     m_mean += delta/m_noOfSamples;
10365     m_sumSquare += delta * (observation - m_mean);
10366   }
10367 }
10368