1 /*
2 Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License, version 2.0,
6 as published by the Free Software Foundation.
7
8 This program is also distributed with certain software (including
9 but not limited to OpenSSL) that is licensed under separate terms,
10 as designated in a particular file or component or in included license
11 documentation. The authors of MySQL hereby grant you an additional
12 permission to link the program and your derivative works with the
13 separately licensed software that they have included with MySQL.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 #define DBSPJ_C
26 #include "Dbspj.hpp"
27
28 #include <ndb_version.h>
29 #include <SectionReader.hpp>
30 #include <signaldata/LqhKey.hpp>
31 #include <signaldata/QueryTree.hpp>
32 #include <signaldata/TcKeyRef.hpp>
33 #include <signaldata/RouteOrd.hpp>
34 #include <signaldata/TransIdAI.hpp>
35 #include <signaldata/DiGetNodes.hpp>
36 #include <signaldata/DihScanTab.hpp>
37 #include <signaldata/AttrInfo.hpp>
38 #include <signaldata/CreateTab.hpp>
39 #include <signaldata/PrepDropTab.hpp>
40 #include <signaldata/DropTab.hpp>
41 #include <signaldata/AlterTab.hpp>
42 #include <signaldata/AlterTable.hpp>
43 #include <signaldata/DbspjErr.hpp>
44 #include <Interpreter.hpp>
45 #include <AttributeHeader.hpp>
46 #include <AttributeDescriptor.hpp>
47 #include <KeyDescriptor.hpp>
48 #include <md5_hash.hpp>
49 #include <signaldata/TcKeyConf.hpp>
50
51 #include <signaldata/NodeFailRep.hpp>
52 #include <signaldata/ReadNodesConf.hpp>
53 #include <signaldata/SignalDroppedRep.hpp>
54 #include <EventLogger.hpp>
55 #include <Bitmask.hpp>
56
57 #define JAM_FILE_ID 479
58
59 extern EventLogger* g_eventLogger;
60 extern Uint32 ErrorSignalReceive;
61 extern Uint32 ErrorMaxSegmentsToSeize;
62
63 #ifdef VM_TRACE
64 /**
65 * 12 bits are used to represent the 'parent-row-correlation-id'.
66 * Effectively limiting max rows in a batch.
67 */
68 static const Uint32 MaxCorrelationId = (1 << 12);
69
70 /**
71 * DEBUG options for different parts of SPJ block
72 * Comment out those part you don't want DEBUG'ed.
73 */
74 //#define DEBUG(x) ndbout << "DBSPJ: "<< x << endl
75 //#define DEBUG_DICT(x) ndbout << "DBSPJ: "<< x << endl
76 //#define DEBUG_LQHKEREQ
77 //#define DEBUG_SCAN_FRAGREQ
78 #endif
79
80 /**
81 * Provide empty defs for those DEBUGs which has to be defined.
82 */
83 #if !defined(DEBUG)
84 #define DEBUG(x)
85 #endif
86
87 #if !defined(DEBUG_DICT)
88 #define DEBUG_DICT(x)
89 #endif
90
91 #define DEBUG_CRASH() ndbassert(false)
92
93 const Ptr<Dbspj::TreeNode> Dbspj::NullTreeNodePtr(0, RNIL );
94 const Dbspj::RowRef Dbspj::NullRowRef = { RNIL, GLOBAL_PAGE_SIZE_WORDS, { 0 } };
95
96
/**
 * An incoming signal was dropped. 'Dropped' really means that we ran
 * out of long signal buffering (section memory) to store its sections,
 * so only a truncated copy of the fixed part is available via
 * SignalDroppedRep. We must still produce the reply the sender expects,
 * or abort the affected Request, so the protocol does not hang.
 */
void Dbspj::execSIGNAL_DROPPED_REP(Signal* signal)
{
  jamEntry();

  /* For fragmented signals: wait until all fragments have arrived
   * (or been dropped) before acting. */
  if (!assembleDroppedFragments(signal))
  {
    jam();
    return;
  }

  const SignalDroppedRep* rep = (SignalDroppedRep*) &signal->theData[0];
  const Uint32 originalGSN= rep->originalGsn;

  DEBUG("SignalDroppedRep received for GSN " << originalGSN);

  switch(originalGSN) {
  case GSN_LQHKEYREQ:  //TC -> SPJ
  {
    jam();
    /* The truncated fixed part still holds enough to address an
     * error reply back to the requestor. */
    const LqhKeyReq * const truncatedLqhKeyReq =
      reinterpret_cast<const LqhKeyReq*>(&rep->originalData[0]);

    handle_early_lqhkey_ref(signal, truncatedLqhKeyReq,
                            DbspjErr::OutOfSectionMemory);
    break;
  }
  case GSN_SCAN_FRAGREQ: //TC -> SPJ
  {
    jam();
    /* Get information necessary to send SCAN_FRAGREF back to TC */
    // TODO : Handle dropped signal fragments

    const ScanFragReq * const truncatedScanFragReq =
      reinterpret_cast<const ScanFragReq*>(&rep->originalData[0]);

    handle_early_scanfrag_ref(signal, truncatedScanFragReq,
                              DbspjErr::OutOfSectionMemory);
    break;
  }
  case GSN_TRANSID_AI: //TUP -> SPJ
  {
    jam();
    /* A row destined for an ongoing request was lost: locate the
     * owning TreeNode / Request via the connectPtr in the fixed part. */
    const TransIdAI * const truncatedTransIdAI =
      reinterpret_cast<const TransIdAI*>(&rep->originalData[0]);
    const Uint32 ptrI = truncatedTransIdAI->connectPtr;

    Ptr<TreeNode> treeNodePtr;
    m_treenode_pool.getPtr(treeNodePtr, ptrI);
    Ptr<Request> requestPtr;
    m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);

    /**
     * Register signal as arrived -> 'done' if this completed this treeNode
     */
    ndbassert(treeNodePtr.p->m_info&&treeNodePtr.p->m_info->m_countSignal);
    (this->*(treeNodePtr.p->m_info->m_countSignal))(signal,
                                                    requestPtr,
                                                    treeNodePtr, 1);

    /* The row data itself is gone, so the whole request must fail. */
    abort(signal, requestPtr, DbspjErr::OutOfSectionMemory);
    break;
  }
  default:
    jam();
    /* Don't expect dropped signals for other GSNs */
    SimulatedBlock::execSIGNAL_DROPPED_REP(signal);
  }

#ifdef ERROR_INSERT
  /* Reset the error-insert trigger once a drop has been provoked. */
  if (ErrorSignalReceive == DBSPJ)
  {
    jam();
    ErrorSignalReceive= 0;
  }
#endif

  return;
}
179
180 inline
181 Uint32
checkTableError(Uint32 schemaVersion) const182 Dbspj::TableRecord::checkTableError(Uint32 schemaVersion) const
183 {
184 DEBUG_DICT("Dbspj::TableRecord::checkTableError"
185 << ", m_flags: " << m_flags
186 << ", m_currentSchemaVersion: " << m_currentSchemaVersion
187 << ", check schemaVersion: " << schemaVersion);
188
189 if (!get_enabled())
190 return DbspjErr::NoSuchTable;
191 if (get_dropping())
192 return DbspjErr::DropTableInProgress;
193 if (table_version_major(schemaVersion) != table_version_major(m_currentSchemaVersion))
194 return DbspjErr::WrongSchemaVersion;
195
196 return 0;
197 }
198
// create table prepare
/**
 * TC_SCHVERREQ: prepare phase of making a table known to SPJ.
 * (Re-)initializes the TableRecord with the received schema version
 * and records the READ_BACKUP / FULLY_REPLICATED table properties,
 * then confirms with TC_SCHVERCONF.
 */
void Dbspj::execTC_SCHVERREQ(Signal* signal)
{
  jamEntry();
  if (! assembleFragments(signal)) {
    jam();
    return;
  }
  const TcSchVerReq* req = CAST_CONSTPTR(TcSchVerReq, signal->getDataPtr());
  const Uint32 tableId = req->tableId;
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;

  DEBUG_DICT("Dbspj::execTC_SCHVERREQ"
     << ", tableId: " << tableId
     << ", version: " << req->tableVersion
  );

  TableRecordPtr tablePtr;
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);

  /* Prepare must not be repeated, nor arrive for an active table. */
  ndbrequire(tablePtr.p->get_prepared() == false);
  ndbrequire(tablePtr.p->get_enabled() == false);
  // Placement-new resets the record to a freshly prepared state.
  new (tablePtr.p) TableRecord(req->tableVersion);

  if (req->readBackup)
  {
    jam();
    tablePtr.p->m_flags |= TableRecord::TR_READ_BACKUP;
  }

  if (req->fullyReplicated)
  {
    jam();
    tablePtr.p->m_flags |= TableRecord::TR_FULLY_REPLICATED;
  }

  /**
   * NOTE: Even if there are more information, like
   * 'tableType', 'noOfPrimaryKeys'etc available from
   * TcSchVerReq, we do *not* store that in TableRecord.
   * Instead this information is retrieved on demand from
   * g_key_descriptor_pool where it is readily available.
   * The 'contract' for consistency of this information is
   * such that:
   * 1) g_key_descriptor[ENTRY] will be populated *before*
   *    any blocks receiving CREATE_TAB_REQ (or equivalent).
   * 2) g_key_descriptor[ENTRY] will be invalidated *after*
   *    all blocks sent DROP_TAB_CONF (commit)
   * Thus, this info is consistent whenever required by SPJ.
   */
  TcSchVerConf * conf = (TcSchVerConf*)signal->getDataPtr();
  conf->senderRef = reference();
  conf->senderData = senderData;
  sendSignal(senderRef, GSN_TC_SCHVERCONF, signal,
             TcSchVerConf::SignalLength, JBB);
}//Dbspj::execTC_SCHVERREQ()
257
258 // create table commit
execTAB_COMMITREQ(Signal * signal)259 void Dbspj::execTAB_COMMITREQ(Signal* signal)
260 {
261 jamEntry();
262 const Uint32 senderData = signal->theData[0];
263 const Uint32 senderRef = signal->theData[1];
264 const Uint32 tableId = signal->theData[2];
265
266 DEBUG_DICT("Dbspj::execTAB_COMMITREQ"
267 << ", tableId: " << tableId
268 );
269
270 TableRecordPtr tablePtr;
271 tablePtr.i = tableId;
272 ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
273
274 ndbrequire(tablePtr.p->get_prepared() == true);
275 ndbrequire(tablePtr.p->get_enabled() == false);
276 tablePtr.p->set_enabled(true);
277 tablePtr.p->set_prepared(false);
278 tablePtr.p->set_dropping(false);
279
280 signal->theData[0] = senderData;
281 signal->theData[1] = reference();
282 signal->theData[2] = tableId;
283 sendSignal(senderRef, GSN_TAB_COMMITCONF, signal, 3, JBB);
284 }//Dbspj::execTAB_COMMITREQ
285
286 void
execPREP_DROP_TAB_REQ(Signal * signal)287 Dbspj::execPREP_DROP_TAB_REQ(Signal* signal)
288 {
289 jamEntry();
290
291 PrepDropTabReq* req = (PrepDropTabReq*)signal->getDataPtr();
292 const Uint32 tableId = req->tableId;
293 const Uint32 senderRef = req->senderRef;
294 const Uint32 senderData = req->senderData;
295
296 DEBUG_DICT("Dbspj::execPREP_DROP_TAB_REQ"
297 << ", tableId: " << tableId
298 );
299
300 TableRecordPtr tablePtr;
301 tablePtr.i = tableId;
302 ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
303
304 if (!tablePtr.p->get_enabled())
305 {
306 jam();
307 PrepDropTabRef* ref = (PrepDropTabRef*)signal->getDataPtrSend();
308 ref->senderRef = reference();
309 ref->senderData = senderData;
310 ref->tableId = tableId;
311 ref->errorCode = PrepDropTabRef::NoSuchTable;
312 sendSignal(senderRef, GSN_PREP_DROP_TAB_REF, signal,
313 PrepDropTabRef::SignalLength, JBB);
314 return;
315 }
316
317 if (tablePtr.p->get_dropping())
318 {
319 jam();
320 PrepDropTabRef* ref = (PrepDropTabRef*)signal->getDataPtrSend();
321 ref->senderRef = reference();
322 ref->senderData = senderData;
323 ref->tableId = tableId;
324 ref->errorCode = PrepDropTabRef::DropInProgress;
325 sendSignal(senderRef, GSN_PREP_DROP_TAB_REF, signal,
326 PrepDropTabRef::SignalLength, JBB);
327 return;
328 }
329
330 tablePtr.p->set_dropping(true);
331 tablePtr.p->set_prepared(false);
332
333 PrepDropTabConf* conf = (PrepDropTabConf*)signal->getDataPtrSend();
334 conf->tableId = tableId;
335 conf->senderRef = reference();
336 conf->senderData = senderData;
337 sendSignal(senderRef, GSN_PREP_DROP_TAB_CONF, signal,
338 PrepDropTabConf::SignalLength, JBB);
339 }//Dbspj::execPREP_DROP_TAB_REQ
340
341 void
execDROP_TAB_REQ(Signal * signal)342 Dbspj::execDROP_TAB_REQ(Signal* signal)
343 {
344 jamEntry();
345
346 const DropTabReq* req = (DropTabReq*)signal->getDataPtr();
347 const Uint32 tableId = req->tableId;
348 const Uint32 senderRef = req->senderRef;
349 const Uint32 senderData = req->senderData;
350 DropTabReq::RequestType rt = (DropTabReq::RequestType)req->requestType;
351
352 DEBUG_DICT("Dbspj::execDROP_TAB_REQ"
353 << ", tableId: " << tableId
354 );
355
356 TableRecordPtr tablePtr;
357 tablePtr.i = tableId;
358 ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
359
360 if (rt == DropTabReq::OnlineDropTab){
361 if (!tablePtr.p->get_enabled()){
362 jam();
363 DropTabRef* ref = (DropTabRef*)signal->getDataPtrSend();
364 ref->senderRef = reference();
365 ref->senderData = senderData;
366 ref->tableId = tableId;
367 ref->errorCode = DropTabRef::NoSuchTable;
368 sendSignal(senderRef, GSN_DROP_TAB_REF, signal,
369 DropTabRef::SignalLength, JBB);
370 return;
371 }
372 if (!tablePtr.p->get_dropping()){
373 jam();
374 DropTabRef* ref = (DropTabRef*)signal->getDataPtrSend();
375 ref->senderRef = reference();
376 ref->senderData = senderData;
377 ref->tableId = tableId;
378 ref->errorCode = DropTabRef::DropWoPrep;
379 sendSignal(senderRef, GSN_DROP_TAB_REF, signal,
380 DropTabRef::SignalLength, JBB);
381 return;
382 }
383 }
384
385 tablePtr.p->set_enabled(false);
386 tablePtr.p->set_prepared(false);
387 tablePtr.p->set_dropping(false);
388
389 DropTabConf * conf = (DropTabConf*)signal->getDataPtrSend();
390 conf->tableId = tableId;
391 conf->senderRef = reference();
392 conf->senderData = senderData;
393 sendSignal(senderRef, GSN_DROP_TAB_CONF, signal,
394 PrepDropTabConf::SignalLength, JBB);
395 }//Dbspj::execDROP_TAB_REQ
396
/**
 * ALTER_TAB_REQ: track schema-version changes of a table through the
 * prepare / commit / revert phases of an ALTER, and toggle the
 * READ_BACKUP flag when the commit carries that change.
 * Always confirms with ALTER_TAB_CONF.
 */
void
Dbspj::execALTER_TAB_REQ(Signal* signal)
{
  jamEntry();

  const AlterTabReq* req = (const AlterTabReq*)signal->getDataPtr();
  const Uint32 tableId = req->tableId;
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  const Uint32 tableVersion = req->tableVersion;
  const Uint32 newTableVersion = req->newTableVersion;
  AlterTabReq::RequestType requestType =
    (AlterTabReq::RequestType) req->requestType;
  D("ALTER_TAB_REQ(SPJ)");

  DEBUG_DICT("Dbspj::execALTER_TAB_REQ"
     << ", tableId: " << tableId
     << ", version: " << tableVersion << " --> " << newTableVersion
  );

  TableRecordPtr tablePtr;
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);

  switch (requestType) {
  case AlterTabReq::AlterTablePrepare:
    jam();
    // Nothing to prepare in SPJ; version only changes on commit/revert.
    break;
  case AlterTabReq::AlterTableRevert:
    jam();
    // Roll back to the pre-alter schema version.
    tablePtr.p->m_currentSchemaVersion = tableVersion;
    break;
  case AlterTabReq::AlterTableCommit:
    jam();
    tablePtr.p->m_currentSchemaVersion = newTableVersion;
    if (AlterTableReq::getReadBackupFlag(req->changeMask))
    {
      /**
       * We simply swap the flag, the preparatory work for this
       * change is done in DBTC.
       */
      if ((tablePtr.p->m_flags & TableRecord::TR_READ_BACKUP) != 0)
      {
        jam();
        /* Reset Read Backup flag */
        tablePtr.p->m_flags &= (~(TableRecord::TR_READ_BACKUP));
      }
      else
      {
        jam();
        /* Set Read Backup flag */
        tablePtr.p->m_flags |= TableRecord::TR_READ_BACKUP;
      }
    }
    break;
  default:
    ndbabort();
  }

  AlterTabConf* conf = (AlterTabConf*)signal->getDataPtrSend();
  conf->senderRef = reference();
  conf->senderData = senderData;
  conf->connectPtr = RNIL;
  sendSignal(senderRef, GSN_ALTER_TAB_CONF, signal,
             AlterTabConf::SignalLength, JBB);
}//Dbspj::execALTER_TAB_REQ
463
/**
 * READ_CONFIG_REQ: set up the block's memory pools, request hashes and
 * the TableRecord array from configuration, then confirm with
 * READ_CONFIG_CONF. (An earlier comment called this a noop - that is
 * stale; the real initialization happens here.)
 */
void Dbspj::execREAD_CONFIG_REQ(Signal* signal)
{
  jamEntry();
  // Copy the request: theData is reused for the CONF below.
  const ReadConfigReq req =
    *reinterpret_cast<const ReadConfigReq*>(signal->getDataPtr());

  Pool_context pc;
  pc.m_block = this;

  DEBUG("execREAD_CONFIG_REQ");
  DEBUG("sizeof(Request): " << sizeof(Request) <<
        " sizeof(TreeNode): " << sizeof(TreeNode));

  /* All per-request objects are carved out of one arena allocator. */
  m_arenaAllocator.init(1024, RT_SPJ_ARENA_BLOCK, pc);
  m_request_pool.arena_pool_init(&m_arenaAllocator, RT_SPJ_REQUEST, pc);
  m_treenode_pool.arena_pool_init(&m_arenaAllocator, RT_SPJ_TREENODE, pc);
  m_scanfraghandle_pool.arena_pool_init(&m_arenaAllocator, RT_SPJ_SCANFRAG, pc);
  m_lookup_request_hash.setSize(16);
  m_scan_request_hash.setSize(16);
  // Row pages come straight from the global memory root.
  void* ptr = m_ctx.m_mm.get_memroot();
  m_page_pool.set((RowPage*)ptr, (Uint32)~0);

  Record_info ri;
  Dependency_map::createRecordInfo(ri, RT_SPJ_DATABUFFER);
  m_dependency_map_pool.init(&m_arenaAllocator, ri, pc);

  {
    const ndb_mgm_configuration_iterator * p =
      m_ctx.m_config.getOwnConfigIterator();
    ndbrequire(p != 0);

    // Number of table records is taken from the cluster configuration.
    ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_SPJ_TABLE, &c_tabrecFilesize));
  }
  m_tableRecord = (TableRecord*)allocRecord("TableRecord",
                                            sizeof(TableRecord),
                                            c_tabrecFilesize);

  // Default-construct every TableRecord (disabled / not prepared).
  TableRecordPtr tablePtr;
  for (tablePtr.i = 0; tablePtr.i < c_tabrecFilesize; tablePtr.i++) {
    ptrAss(tablePtr, m_tableRecord);
    new (tablePtr.p) TableRecord;
  }//for

  ReadConfigConf* const conf =
    reinterpret_cast<ReadConfigConf*>(signal->getDataPtrSend());
  conf->senderRef = reference();
  conf->senderData = req.senderData;

  sendSignal(req.senderRef, GSN_READ_CONFIG_CONF, signal,
             ReadConfigConf::SignalLength, JBB);
}//Dbspj::execREAD_CONF_REQ()
516
517 static Uint32 f_STTOR_REF = 0;
518
/**
 * STTOR: start-phase handling.
 *  - phase 1: kick off the periodic CONTINUEB(0) page-release timer.
 *  - phase 4: ask NDBCNTR for the node set; STTORRY is sent from
 *    execREAD_NODESCONF when the reply arrives.
 *  - other phases: acknowledge immediately with STTORRY.
 */
void Dbspj::execSTTOR(Signal* signal)
{
//#define UNIT_TEST_DATABUFFER2

  jamEntry();
  /* START CASE */
  const Uint16 tphase = signal->theData[1];
  // Remember who to answer with STTORRY (also used by execREAD_NODESCONF).
  f_STTOR_REF = signal->getSendersBlockRef();

  if (tphase == 1)
  {
    jam();
    signal->theData[0] = 0;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 1000, 1);
  }

  if (tphase == 4)
  {
    jam();

    signal->theData[0] = reference();
    sendSignal(NDBCNTR_REF, GSN_READ_NODESREQ, signal, 1, JBB);
    // STTORRY is deferred until READ_NODESCONF has been processed.
    return;
  }

  sendSTTORRY(signal);

#ifdef UNIT_TEST_DATABUFFER2
  /* Stand-alone exercise of ArenaPool / DataBuffer, only compiled in
   * when the #define at the top of this function is enabled. */
  if (tphase == 120)
  {
    ndbout_c("basic test of ArenaPool / DataBuffer");

    for (Uint32 i = 0; i<100; i++)
    {
      ArenaHead ah;
      if (!m_arenaAllocator.seize(ah))
      {
        ndbout_c("Failed to allocate arena");
        break;
      }

      ndbout_c("*** LOOP %u", i);
      Uint32 sum = 0;
      Dependency_map::Head head;
      LocalArenaPool<DataBufferSegment<14> > pool(ah, m_dependency_map_pool);
      for (Uint32 j = 0; j<100; j++)
      {
        Uint32 sz = rand() % 1000;
        if (0)
          ndbout_c("adding %u", sz);
        Local_dependency_map list(pool, head);
        for (Uint32 i = 0; i<sz; i++)
          signal->theData[i] = sum + i;
        list.append(signal->theData, sz);
        sum += sz;
      }

      {
        ndbrequire(head.getSize() == sum);
        Local_dependency_map list(pool, head);
        Dependency_map::ConstDataBufferIterator it;
        Uint32 cnt = 0;
        for (list.first(it); !it.isNull(); list.next(it))
        {
          ndbrequire(* it.data == cnt);
          cnt++;
        }

        ndbrequire(cnt == sum);
      }

      Resource_limit rl;
      if (m_ctx.m_mm.get_resource_limit(7, rl))
      {
        ndbout_c("Resource %d min: %d max: %d curr: %d",
                 7, rl.m_min, rl.m_max, rl.m_curr);
      }

      {
        ndbout_c("release map");
        Local_dependency_map list(pool, head);
        list.release();
      }

      ndbout_c("release all");
      m_arenaAllocator.release(ah);
      ndbout_c("*** LOOP %u sum: %u", i, sum);
    }
  }
#endif
}//Dbspj::execSTTOR()
610
611 void
sendSTTORRY(Signal * signal)612 Dbspj::sendSTTORRY(Signal* signal)
613 {
614 signal->theData[0] = 0;
615 signal->theData[1] = 0; /* BLOCK CATEGORY */
616 signal->theData[2] = 0; /* SIGNAL VERSION NUMBER */
617 signal->theData[3] = 4;
618 #ifdef UNIT_TEST_DATABUFFER2
619 signal->theData[4] = 120; /* Start phase end*/
620 #else
621 signal->theData[4] = 255;
622 #endif
623 signal->theData[5] = 255;
624 sendSignal(f_STTOR_REF, GSN_STTORRY, signal, 6, JBB);
625 }
626
/**
 * READ_NODESCONF: reply to the READ_NODESREQ sent in start phase 4.
 * Initializes c_alive_nodes from the node bitmasks, reads each data
 * node's location domain id from the cluster configuration, and then
 * completes the start phase with STTORRY.
 */
void
Dbspj::execREAD_NODESCONF(Signal* signal)
{
  jamEntry();

  ReadNodesConf * const conf = (ReadNodesConf *)signal->getDataPtr();
  {
    /* The node bitmasks are shipped in a signal section; copy them
     * into the fixed-size part of the CONF struct. */
    ndbrequire(signal->getNoOfSections() == 1);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz == 5 * NdbNodeBitmask::Size);
    copy((Uint32*)&conf->definedNodes.rep.data, ptr);
    releaseSections(handle);
  }

  if (getNodeState().getNodeRestartInProgress())
  {
    jam();
    // Node restart: the already-started nodes plus ourselves are alive.
    c_alive_nodes = conf->startedNodes;
    c_alive_nodes.set(getOwnNodeId());
  }
  else
  {
    jam();
    // Initial/system start: starting and started nodes are all alive.
    c_alive_nodes = conf->startingNodes;
    NdbNodeBitmask tmp = conf->startedNodes;
    c_alive_nodes.bitOR(tmp);
  }

  for (Uint32 i = 0; i < MAX_NDB_NODES; i++)
  {
    m_location_domain_id[i] = 0;
  }

  /* Walk the [NODE] sections of the cluster configuration and record
   * the location domain of every data node. */
  ndb_mgm_configuration *p =
    m_ctx.m_config.getClusterConfig();
  ndb_mgm_configuration_iterator *p_iter =
    ndb_mgm_create_configuration_iterator(p, CFG_SECTION_NODE);

  for (ndb_mgm_first(p_iter);
       ndb_mgm_valid(p_iter);
       ndb_mgm_next(p_iter))
  {
    jam();
    Uint32 location_domain_id = 0;
    Uint32 nodeId = 0;
    Uint32 nodeType = 0;
    ndbrequire(!ndb_mgm_get_int_parameter(p_iter, CFG_NODE_ID, &nodeId) &&
               nodeId != 0);
    jamLine(Uint16(nodeId));
    ndbrequire(!ndb_mgm_get_int_parameter(p_iter,
                                          CFG_TYPE_OF_SECTION,
                                          &nodeType));
    ndbrequire(nodeId != 0);
    if (nodeType != NODE_TYPE_DB)
    {
      jam();
      // Only data nodes have a location domain of interest here.
      continue;
    }
    ndbrequire(nodeId < MAX_NDB_NODES);
    // Optional parameter: leaves location_domain_id == 0 when unset.
    ndb_mgm_get_int_parameter(p_iter,
                              CFG_LOCATION_DOMAIN_ID,
                              &location_domain_id);
    m_location_domain_id[nodeId] = location_domain_id;
  }
  ndb_mgm_destroy_iterator(p_iter);
  sendSTTORRY(signal);
}
696
697 void
execINCL_NODEREQ(Signal * signal)698 Dbspj::execINCL_NODEREQ(Signal* signal)
699 {
700 jamEntry();
701 const Uint32 senderRef = signal->theData[0];
702 const Uint32 nodeId = signal->theData[1];
703
704 ndbrequire(!c_alive_nodes.get(nodeId));
705 c_alive_nodes.set(nodeId);
706
707 signal->theData[0] = nodeId;
708 signal->theData[1] = reference();
709 sendSignal(senderRef, GSN_INCL_NODECONF, signal, 2, JBB);
710 }
711
/**
 * NODE_FAILREP: one or more data nodes failed. Extract the bitmask of
 * failed nodes (from a section, or from the short-signal layout sent
 * by older versions), remove them from c_alive_nodes, clean up any
 * fragmented signals in flight to/from them, and start a CONTINUEB(1)
 * walk over the lookup request hash to fail affected requests.
 */
void
Dbspj::execNODE_FAILREP(Signal* signal)
{
  jamEntry();

  NodeFailRep * rep = (NodeFailRep*)signal->getDataPtr();
  if(signal->getLength() == NodeFailRep::SignalLength)
  {
    /* Current protocol: full bitmask carried in a signal section. */
    ndbrequire(signal->getNoOfSections() == 1);
    ndbrequire(getNodeInfo(refToNode(signal->getSendersBlockRef())).m_version);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    memset(rep->theNodes, 0, sizeof(rep->theNodes));
    copy(rep->theNodes, ptr);
    releaseSections(handle);
  }
  else
  {
    /* Older (48-bit) layout: zero the words the sender did not fill. */
    memset(rep->theNodes + NdbNodeBitmask48::Size,
           0,
           _NDB_NBM_DIFF_BYTES);
  }
  NdbNodeBitmask failed;
  failed.assign(NdbNodeBitmask::Size, rep->theNodes);

  c_alive_nodes.bitANDC(failed);

  /* Clean up possibly fragmented signals being received or sent */
  for (Uint32 node = 1; node < MAX_NDB_NODES; node++)
  {
    if (failed.get(node))
    {
      jam();
      simBlockNodeFailure(signal, node);
    }//if
  }//for

  /* Continue via CONTINUEB(type=1, bucket=0): scan the lookup request
   * hash for requests touching the failed nodes. The failed-node mask
   * travels with the CONTINUEB as a linear section. */
  signal->theData[0] = 1;
  signal->theData[1] = 0;
  failed.copyto(NdbNodeBitmask::Size, signal->theData + 2);
  LinearSectionPtr lsptr[3];
  lsptr[0].p = signal->theData + 2;
  lsptr[0].sz = failed.getPackedLengthInWords();
  sendSignal(reference(), GSN_CONTINUEB, signal, 2,
             JBB, lsptr, 1);
}
759
760 void
execAPI_FAILREQ(Signal * signal)761 Dbspj::execAPI_FAILREQ(Signal* signal)
762 {
763 jamEntry();
764 Uint32 failedApiNode = signal->theData[0];
765 Uint32 ref = signal->theData[1];
766
767 /**
768 * We only need to care about lookups
769 * as SCAN's are aborted by DBTC
770 *
771 * As SPJ does not receive / send fragmented signals
772 * directly to API nodes, simBlockNodeFailure()
773 * should not really be required - assert this.
774 */
775 Uint32 elementsCleaned = simBlockNodeFailure(signal, failedApiNode);
776 ndbassert(elementsCleaned == 0); // As SPJ has no fragmented API signals
777 (void) elementsCleaned; // Avoid compiler error
778
779 signal->theData[0] = failedApiNode;
780 signal->theData[1] = reference();
781 sendSignal(ref, GSN_API_FAILCONF, signal, 2, JBB);
782 }
783
784 void
execCONTINUEB(Signal * signal)785 Dbspj::execCONTINUEB(Signal* signal)
786 {
787 jamEntry();
788 switch(signal->theData[0]) {
789 case 0:
790 releaseGlobal(signal);
791 return;
792 case 1:
793 nodeFail_checkRequests(signal);
794 return;
795 case 2:
796 nodeFail_checkRequests(signal);
797 return;
798 case 3:
799 {
800 Ptr<TreeNode> treeNodePtr;
801 Ptr<Request> requestPtr;
802 m_treenode_pool.getPtr(treeNodePtr, signal->theData[1]);
803 m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
804 scanFrag_sendDihGetNodesReq(signal, requestPtr, treeNodePtr);
805 checkPrepareComplete(signal, requestPtr);
806 return;
807 }
808 }
809
810 ndbabort();
811 }
812
/**
 * Walk one of the request hashes (type 1 = lookups, type 2 = scans)
 * and fail every Request that involves a failed node. The failed-node
 * bitmask arrives as a signal section. The walk is split into
 * real-time breaks via CONTINUEB; when the lookup hash is exhausted
 * the scan hash is started, and the scan pass ends the protocol.
 */
void
Dbspj::nodeFail_checkRequests(Signal* signal)
{
  jam();
  const Uint32 type = signal->theData[0];
  const Uint32 bucket = signal->theData[1];

  NdbNodeBitmask failed;
  ndbrequire(signal->getNoOfSections() == 1);

  SegmentedSectionPtr ptr;
  SectionHandle handle(this,signal);
  handle.getSection(ptr, 0);
  ndbrequire(ptr.sz <= NdbNodeBitmask::Size);
  copy(failed.rep.data, ptr);
  releaseSections(handle);

  Request_iterator iter;
  Request_hash * hash = NULL;
  switch(type){
  case 1:
    hash = &m_lookup_request_hash;
    break;
  case 2:
    hash = &m_scan_request_hash;
    break;
  default:
    hash = NULL; //Silence compiler warning
    ndbabort(); //Impossible, avoid warning
  }
  // Resume the iteration at the bucket saved in the previous round.
  hash->next(bucket, iter);

  /* Process at most RT_BREAK requests per round, but always finish the
   * current bucket so the saved resume point stays valid. */
  const Uint32 RT_BREAK = 64;
  for(Uint32 i = 0; (i<RT_BREAK || iter.bucket == bucket) &&
        !iter.curr.isNull(); i++)
  {
    jam();

    // Advance the iterator before nodeFail() may release the Request.
    Ptr<Request> requestPtr = iter.curr;
    hash->next(iter);
    i += nodeFail(signal, requestPtr, failed);
  }

  if (!iter.curr.isNull())
  {
    jam();
    // More work in this hash: reschedule ourselves at the same point.
    signal->theData[0] = type;
    signal->theData[1] = bucket;
    failed.copyto(NdbNodeBitmask::Size, signal->theData + 2);
    LinearSectionPtr lsptr[3];
    lsptr[0].p = signal->theData + 2;
    lsptr[0].sz = failed.getPackedLengthInWords();
    sendSignal(reference(), GSN_CONTINUEB, signal, 2,
               JBB, lsptr, 1);
  }
  else if (type == 1)
  {
    jam();
    // Lookup hash done: start over on the scan hash (type 2).
    signal->theData[0] = 2;
    signal->theData[1] = 0;
    failed.copyto(NdbNodeBitmask::Size, signal->theData+2);
    LinearSectionPtr lsptr[3];
    lsptr[0].p = signal->theData + 2;
    lsptr[0].sz = failed.getPackedLengthInWords();
    sendSignal(reference(), GSN_CONTINUEB, signal, 2,
               JBB, lsptr, 1);
  }
  else if (type == 2)
  {
    jam();
    // Both hashes processed; node-failure handling for SPJ is complete.
  }
}
885
/**
 * MODULE LQHKEYREQ
 */
/**
 * LQHKEYREQ: entry point for a pushed-down (multi-)lookup query sent
 * from TC. Builds a Request with its TreeNodes from the query tree and
 * parameters carried in the ATTRINFO section, stores it in the lookup
 * hash, and starts the prepare phase. On any failure an error reply is
 * produced via handle_early_lqhkey_ref().
 */
void Dbspj::execLQHKEYREQ(Signal* signal)
{
  jamEntry();
  if (unlikely(!assembleFragments(signal)))
  {
    jam();
    return;
  }

  c_Counters.incr_counter(CI_READS_RECEIVED, 1);

  if (ERROR_INSERTED(17014))
  {
    ndbrequire(refToNode(signal->getSendersBlockRef()) == getOwnNodeId());
  }

  const LqhKeyReq* req = reinterpret_cast<const LqhKeyReq*>(signal->getDataPtr());

  /**
   * #0 - KEYINFO contains key for first operation (used for hash in TC)
   * #1 - ATTRINFO contains tree + parameters
   *      (unless StoredProcId is set, when only paramters are sent,
   *       but this is not yet implemented)
   */
  SegmentedSectionPtr attrPtr;
  SectionHandle handle(this, signal);
  handle.getSection(attrPtr, LqhKeyReq::AttrInfoSectionNum);
  const Uint32 keyPtrI = handle.m_ptr[LqhKeyReq::KeyInfoSectionNum].i;

  Uint32 err;
  Ptr<Request> requestPtr(0, RNIL);
  /* do { } while(0) so error cases can 'break' to the common
   * error handling at the bottom with 'err' set. */
  do
  {
    ArenaHead ah;
    err = DbspjErr::OutOfQueryMemory;
    if (unlikely(!m_arenaAllocator.seize(ah)))
      break;

    if (ERROR_INSERTED_CLEAR(17001))
    {
      jam();
      ndbout_c("Injecting OutOfQueryMem error 17001 at line %d file %s",
               __LINE__,  __FILE__);
      break;
    }
    if (unlikely(!m_request_pool.seize(ah, requestPtr)))
    {
      jam();
      break;
    }
    new (requestPtr.p) Request(ah);
    do_init(requestPtr.p, req, signal->getSendersBlockRef());

    Uint32 len_cnt;

    {
      SectionReader r0(attrPtr, getSectionSegmentPool());

      /* First ATTRINFO word packs tree length and node count. */
      err = DbspjErr::ZeroLengthQueryTree;
      if (unlikely(!r0.getWord(&len_cnt)))
        break;
    }

    Uint32 len = QueryTree::getLength(len_cnt);
    Uint32 cnt = QueryTree::getNodeCnt(len_cnt);

    {
      /* Two readers over the same section: one positioned at the tree,
       * one stepped past it to the parameter area. */
      SectionReader treeReader(attrPtr, getSectionSegmentPool());
      SectionReader paramReader(attrPtr, getSectionSegmentPool());
      paramReader.step(len); // skip over tree to parameters

      Build_context ctx;
      ctx.m_resultRef = req->variableData[0];
      ctx.m_savepointId = req->savePointId;
      ctx.m_scanPrio = 1;
      ctx.m_start_signal = signal;
      ctx.m_senderRef = signal->getSendersBlockRef();

      err = build(ctx, requestPtr, treeReader, paramReader);
      if (unlikely(err != 0))
        break;

      /**
       * Root TreeNode in Request takes ownership of keyPtr
       * section when build has completed.
       * We are done with attrPtr which are now released.
       */
      Ptr<TreeNode> rootNodePtr = ctx.m_node_list[0];
      rootNodePtr.p->m_send.m_keyInfoPtrI = keyPtrI;
      release(attrPtr);
      handle.clear();
    }

    /**
     * Store request in list(s)/hash(es)
     */
    store_lookup(requestPtr);

    /**
     * A query being shipped as a LQHKEYREQ may return at most a row
     * per operation i.e be a (multi-)lookup
     */
    if (ERROR_INSERTED_CLEAR(17013) ||
        unlikely(!requestPtr.p->isLookup() || requestPtr.p->m_node_cnt != cnt))
    {
      jam();
      err = DbspjErr::InvalidRequest;
      break;
    }

    prepare(signal, requestPtr);
    checkPrepareComplete(signal, requestPtr);
    return;
  } while (0);

  /**
   * Error handling below,
   * 'err' should contain error code.
   */
  ndbassert(err != 0);
  if (!requestPtr.isNull())
  {
    jam();
    cleanup(requestPtr);
  }
  releaseSections(handle);  // a NOOP, if we reached 'handle.clear()' above
  handle_early_lqhkey_ref(signal, req, err);
}
1017
/**
 * Initialize a freshly constructed (lookup) Request from the incoming
 * LQHKEYREQ. Determines where the final result / reply should be sent:
 * for dirty reads the API is answered directly, otherwise the TC side
 * is the receiver. 'senderRef' is the block reference of the requestor.
 */
void
Dbspj::do_init(Request* requestP, const LqhKeyReq* req, Uint32 senderRef)
{
  requestP->m_bits = 0;
  requestP->m_errCode = 0;
  requestP->m_state = Request::RS_BUILDING;
  requestP->m_node_cnt = 0;
  requestP->m_cnt_active = 0;
  requestP->m_rows = 0;
  requestP->m_active_tree_nodes.clear();
  // All tree nodes start out 'completed'; cleared as they become active.
  requestP->m_completed_tree_nodes.set();
  requestP->m_outstanding = 0;
  requestP->m_transId[0] = req->transId1;
  requestP->m_transId[1] = req->transId2;
  requestP->m_rootFragId = LqhKeyReq::getFragmentId(req->fragmentData);
  requestP->m_rootFragCnt = 1;
  bzero(requestP->m_lookup_node_data, sizeof(requestP->m_lookup_node_data));
#ifdef SPJ_TRACE_TIME
  requestP->m_cnt_batches = 0;
  requestP->m_sum_rows = 0;
  requestP->m_sum_running = 0;
  requestP->m_sum_waiting = 0;
  requestP->m_save_time = NdbTick_getCurrentTicks();
#endif
  const Uint32 reqInfo = req->requestInfo;
  Uint32 tmp = req->clientConnectPtr;
  if (LqhKeyReq::getDirtyFlag(reqInfo) &&
      LqhKeyReq::getOperation(reqInfo) == ZREAD)
  {
    jam();

    /* Dirty read: reply goes directly to the API; its operation record
     * id is in variableData[1] (variableData[0] is the API ref). */
    ndbrequire(LqhKeyReq::getApplicationAddressFlag(reqInfo));
    //const Uint32 apiRef   = lqhKeyReq->variableData[0];
    //const Uint32 apiOpRec = lqhKeyReq->variableData[1];
    tmp = req->variableData[1];
    requestP->m_senderData = tmp;
    requestP->m_senderRef = senderRef;
  }
  else
  {
    /* Non-dirty: reply to TC. Which variableData word holds the TC
     * operation record depends on the flags set in requestInfo. */
    if (LqhKeyReq::getSameClientAndTcFlag(reqInfo) == 1)
    {
      if (LqhKeyReq::getApplicationAddressFlag(reqInfo))
        tmp = req->variableData[2];
      else
        tmp = req->variableData[0];
    }
    requestP->m_senderData = tmp;
    requestP->m_senderRef = senderRef;
  }
  requestP->m_rootResultData = tmp;
}
1070
1071 void
store_lookup(Ptr<Request> requestPtr)1072 Dbspj::store_lookup(Ptr<Request> requestPtr)
1073 {
1074 ndbassert(requestPtr.p->isLookup());
1075 Ptr<Request> tmp;
1076 bool found = m_lookup_request_hash.find(tmp, *requestPtr.p);
1077 ndbrequire(found == false);
1078 m_lookup_request_hash.add(requestPtr);
1079 }
1080
/**
 * Send an error reply for a LQHKEYREQ that failed before a Request was
 * (fully) set up. Dirty reads are answered directly to the API with
 * TCKEYREF; everything else is answered to the requestor (TC) with
 * LQHKEYREF. 'err' is the non-zero DbspjErr error code to report.
 */
void
Dbspj::handle_early_lqhkey_ref(Signal* signal,
                               const LqhKeyReq * lqhKeyReq,
                               Uint32 err)
{
  /**
   * Error path...
   */
  ndbrequire(err);
  const Uint32 reqInfo = lqhKeyReq->requestInfo;
  const Uint32 transid[2] = { lqhKeyReq->transId1, lqhKeyReq->transId2 };

  if (LqhKeyReq::getDirtyFlag(reqInfo) &&
      LqhKeyReq::getOperation(reqInfo) == ZREAD)
  {
    jam();
    /* Dirty read sends TCKEYREF direct to client, and nothing to TC */
    ndbrequire(LqhKeyReq::getApplicationAddressFlag(reqInfo));
    const Uint32 apiRef   = lqhKeyReq->variableData[0];
    const Uint32 apiOpRec = lqhKeyReq->variableData[1];

    TcKeyRef* const tcKeyRef = reinterpret_cast<TcKeyRef*>(signal->getDataPtrSend());

    tcKeyRef->connectPtr = apiOpRec;
    tcKeyRef->transId[0] = transid[0];
    tcKeyRef->transId[1] = transid[1];
    tcKeyRef->errorCode = err;
    sendTCKEYREF(signal, apiRef, signal->getSendersBlockRef());
  }
  else
  {
    jam();
    const Uint32 returnref = signal->getSendersBlockRef();
    const Uint32 clientPtr = lqhKeyReq->clientConnectPtr;

    /* Pick the TC operation record: which variableData word holds it
     * depends on the flags set in requestInfo (cf. do_init()). */
    Uint32 TcOprec = clientPtr;
    if (LqhKeyReq::getSameClientAndTcFlag(reqInfo) == 1)
    {
      if (LqhKeyReq::getApplicationAddressFlag(reqInfo))
        TcOprec = lqhKeyReq->variableData[2];
      else
        TcOprec = lqhKeyReq->variableData[0];
    }

    LqhKeyRef* const ref = reinterpret_cast<LqhKeyRef*>(signal->getDataPtrSend());
    ref->userRef = clientPtr;
    ref->connectPtr = TcOprec;
    ref->errorCode = err;
    ref->transId1 = transid[0];
    ref->transId2 = transid[1];
    sendSignal(returnref, GSN_LQHKEYREF, signal,
               LqhKeyRef::SignalLength, JBB);
  }
}
1135
void
Dbspj::sendTCKEYREF(Signal* signal, Uint32 ref, Uint32 routeRef)
{
  /**
   * Send TCKEYREF (already built in signal->theData) to 'ref'.
   * If the destination node is not directly connected, wrap the signal
   * in a ROUTE_ORD sent to 'routeRef', which forwards it onwards.
   */
  const Uint32 nodeId = refToNode(ref);
  const bool connectedToNode = getNodeInfo(nodeId).m_connected;

  if (likely(connectedToNode))
  {
    jam();
    sendSignal(ref, GSN_TCKEYREF, signal, TcKeyRef::SignalLength, JBB);
  }
  else
  {
    jam();
    // Move the TCKEYREF payload out of the way (to theData[25..]) so
    // the RouteOrd header can be built at the start of theData.
    memmove(signal->theData+25, signal->theData, 4*TcKeyRef::SignalLength);
    RouteOrd* ord = (RouteOrd*)signal->getDataPtrSend();
    ord->dstRef = ref;
    ord->srcRef = reference();
    ord->gsn = GSN_TCKEYREF;
    ord->cnt = 0;
    // Attach the saved TCKEYREF payload as a linear section
    LinearSectionPtr ptr[3];
    ptr[0].p = signal->theData+25;
    ptr[0].sz = TcKeyRef::SignalLength;
    sendSignal(routeRef, GSN_ROUTE_ORD, signal, RouteOrd::SignalLength, JBB,
               ptr, 1);
  }
}
1163
void
Dbspj::sendTCKEYCONF(Signal* signal, Uint32 len, Uint32 ref, Uint32 routeRef)
{
  /**
   * Send a TCKEYCONF of 'len' words (already built in signal->theData)
   * to 'ref'. Mirrors sendTCKEYREF: if the destination node is not
   * directly connected, route the signal via a ROUTE_ORD to 'routeRef'.
   */
  const Uint32 nodeId = refToNode(ref);
  const bool connectedToNode = getNodeInfo(nodeId).m_connected;

  if (likely(connectedToNode))
  {
    jam();
    sendSignal(ref, GSN_TCKEYCONF, signal, len, JBB);
  }
  else
  {
    jam();
    // Move the TCKEYCONF payload to theData[25..] so the RouteOrd
    // header can be built at the start of theData.
    memmove(signal->theData+25, signal->theData, 4*len);
    RouteOrd* ord = (RouteOrd*)signal->getDataPtrSend();
    ord->dstRef = ref;
    ord->srcRef = reference();
    ord->gsn = GSN_TCKEYCONF;
    ord->cnt = 0;
    // Attach the saved TCKEYCONF payload as a linear section
    LinearSectionPtr ptr[3];
    ptr[0].p = signal->theData+25;
    ptr[0].sz = len;
    sendSignal(routeRef, GSN_ROUTE_ORD, signal, RouteOrd::SignalLength, JBB,
               ptr, 1);
  }
}
1191
1192 /**
1193 * END - MODULE LQHKEYREQ
1194 */
1195
1196
1197 /**
1198 * MODULE SCAN_FRAGREQ
1199 */
void
Dbspj::execSCAN_FRAGREQ(Signal* signal)
{
  /**
   * Entry point for a SPJ scan request (SCAN_FRAGREQ):
   * allocate a Request, parse the query tree + parameters from the
   * ATTRINFO section, build the TreeNode structure and start the
   * prepare phase. On any failure, reply with SCAN_FRAGREF.
   */
  jamEntry();

  /* Reassemble if the request was fragmented */
  if (!assembleFragments(signal))
  {
    jam();
    return;
  }

  if (ERROR_INSERTED(17014))
  {
    ndbrequire(refToNode(signal->getSendersBlockRef()) == getOwnNodeId());
  }

  const ScanFragReq * req = (ScanFragReq *)&signal->theData[0];

#ifdef DEBUG_SCAN_FRAGREQ
  ndbout_c("Incomming SCAN_FRAGREQ ");
  printSCAN_FRAGREQ(stdout, signal->getDataPtrSend(),
                    ScanFragReq::SignalLength + 2,
                    DBLQH);
#endif

  /**
   * #0 - ATTRINFO contains tree + parameters
   *      (unless StoredProcId is set, when only parameters are sent,
   *       but this is not yet implemented)
   * #1 - KEYINFO if first op is index scan - contains bounds for first scan
   *              if first op is lookup - contains keyinfo for lookup
   */
  SectionHandle handle(this, signal);
  SegmentedSectionPtr attrPtr;
  handle.getSection(attrPtr, ScanFragReq::AttrInfoSectionNum);

  Uint32 err;
  Ptr<Request> requestPtr(0, RNIL);
  do
  {
    // Allocate an arena and construct the Request object in it
    ArenaHead ah;
    err = DbspjErr::OutOfQueryMemory;
    if (unlikely(!m_arenaAllocator.seize(ah)))
      break;

    if (ERROR_INSERTED_CLEAR(17002))
    {
      ndbout_c("Injecting OutOfQueryMem error 17002 at line %d file %s",
               __LINE__, __FILE__);
      jam();
      break;
    }
    if (unlikely(!m_request_pool.seize(ah, requestPtr)))
    {
      jam();
      break;
    }
    new (requestPtr.p) Request(ah);
    do_init(requestPtr.p, req, signal->getSendersBlockRef());

    // First word of ATTRINFO holds query-tree length and node count
    Uint32 len_cnt;
    {
      SectionReader r0(attrPtr, getSectionSegmentPool());
      err = DbspjErr::ZeroLengthQueryTree;
      if (unlikely(!r0.getWord(&len_cnt)))
        break;
    }

    Uint32 len = QueryTree::getLength(len_cnt);
    Uint32 cnt = QueryTree::getNodeCnt(len_cnt);

    // Optional last section: fragment-id list for a multi-fragment scan
    Uint32 sectionCnt = handle.m_cnt;
    Uint32 fragIdsPtrI = RNIL;
    if (ScanFragReq::getMultiFragFlag(req->requestInfo))
    {
      jam();
      sectionCnt--;
      fragIdsPtrI = handle.m_ptr[sectionCnt].i;
      SectionReader fragsReader(fragIdsPtrI, getSectionSegmentPool());

      //Unpack into extended signal memory:
      const Uint32 fragCnt = signal->theData[25] = fragsReader.getSize();
      if (unlikely(!fragsReader.getWords(&signal->theData[26], fragCnt)))
      {
        jam();
        err = DbspjErr::InvalidRequest;
        break;
      }
    }

    {
      // Two readers over the same ATTRINFO: tree first, parameters after
      SectionReader treeReader(attrPtr, getSectionSegmentPool());
      SectionReader paramReader(attrPtr, getSectionSegmentPool());
      paramReader.step(len); // skip over tree to parameters

      Build_context ctx;
      ctx.m_resultRef = req->resultRef;
      ctx.m_scanPrio = ScanFragReq::getScanPrio(req->requestInfo);
      ctx.m_savepointId = req->savePointId;
      ctx.m_batch_size_rows = req->batch_size_rows;
      ctx.m_start_signal = signal;
      ctx.m_senderRef = signal->getSendersBlockRef();

      err = build(ctx, requestPtr, treeReader, paramReader);
      if (unlikely(err != 0))
        break;

      /**
       * Root TreeNode in Request takes ownership of keyPtr
       * section when build has completed.
       * We are done with attrPtr and MultiFrag-list which are
       * now released.
       */
      Ptr<TreeNode> rootNodePtr = ctx.m_node_list[0];
      if (sectionCnt > ScanFragReq::KeyInfoSectionNum)
      {
        jam();
        sectionCnt--;
        const Uint32 keyPtrI = handle.m_ptr[ScanFragReq::KeyInfoSectionNum].i;
        rootNodePtr.p->m_send.m_keyInfoPtrI = keyPtrI;
      }
      release(attrPtr);
      releaseSection(fragIdsPtrI); //MultiFrag list
      handle.clear();
    }

    /**
     * Store request in list(s)/hash(es)
     */
    store_scan(requestPtr);

    // Sanity check: request must be a scan with the announced node count
    if (ERROR_INSERTED_CLEAR(17013) ||
        unlikely(!requestPtr.p->isScan() || requestPtr.p->m_node_cnt != cnt))
    {
      jam();
      err = DbspjErr::InvalidRequest;
      break;
    }

    prepare(signal, requestPtr);
    checkPrepareComplete(signal, requestPtr);
    return;
  } while (0);

  /**
   * Error handling below,
   * 'err' should contain error code.
   */
  ndbassert(err != 0);
  if (!requestPtr.isNull())
  {
    jam();
    cleanup(requestPtr);
  }
  releaseSections(handle); // a NOOP, if we reached 'handle.clear()' above
  handle_early_scanfrag_ref(signal, req, err);
}
1358
void
Dbspj::do_init(Request* requestP, const ScanFragReq* req, Uint32 senderRef)
{
  /**
   * Initialize a freshly constructed Request for a (root-)scan,
   * populated from the incoming SCAN_FRAGREQ and the sender reference.
   */
  requestP->m_bits = Request::RT_SCAN;
  requestP->m_errCode = 0;
  requestP->m_state = Request::RS_BUILDING;
  requestP->m_node_cnt = 0;
  requestP->m_cnt_active = 0;
  requestP->m_rows = 0;
  requestP->m_active_tree_nodes.clear();
  // All tree nodes count as 'completed' until they are started
  requestP->m_completed_tree_nodes.set();
  requestP->m_outstanding = 0;
  requestP->m_senderRef = senderRef;
  requestP->m_senderData = req->senderData;
  requestP->m_transId[0] = req->transId1;
  requestP->m_transId[1] = req->transId2;
  requestP->m_rootResultData = req->resultData;
  requestP->m_rootFragId = req->fragmentNoKeyLen;
  requestP->m_rootFragCnt = 0; //Filled in later
  bzero(requestP->m_lookup_node_data, sizeof(requestP->m_lookup_node_data));
#ifdef SPJ_TRACE_TIME
  requestP->m_cnt_batches = 0;
  requestP->m_sum_rows = 0;
  requestP->m_sum_running = 0;
  requestP->m_sum_waiting = 0;
  requestP->m_save_time = NdbTick_getCurrentTicks();
#endif
}
1387
1388 void
store_scan(Ptr<Request> requestPtr)1389 Dbspj::store_scan(Ptr<Request> requestPtr)
1390 {
1391 ndbassert(requestPtr.p->isScan());
1392 Ptr<Request> tmp;
1393 bool found = m_scan_request_hash.find(tmp, *requestPtr.p);
1394 ndbrequire(found == false);
1395 m_scan_request_hash.add(requestPtr);
1396 }
1397
1398 void
handle_early_scanfrag_ref(Signal * signal,const ScanFragReq * _req,Uint32 err)1399 Dbspj::handle_early_scanfrag_ref(Signal* signal,
1400 const ScanFragReq * _req,
1401 Uint32 err)
1402 {
1403 ScanFragReq req = *_req;
1404 Uint32 senderRef = signal->getSendersBlockRef();
1405
1406 ScanFragRef * ref = (ScanFragRef*)&signal->theData[0];
1407 ref->senderData = req.senderData;
1408 ref->transId1 = req.transId1;
1409 ref->transId2 = req.transId2;
1410 ref->errorCode = err;
1411 sendSignal(senderRef, GSN_SCAN_FRAGREF, signal,
1412 ScanFragRef::SignalLength, JBB);
1413 }
1414
1415 /**
1416 * END - MODULE SCAN_FRAGREQ
1417 */
1418
1419 /**
1420 * MODULE GENERIC
1421 */
1422 Uint32
build(Build_context & ctx,Ptr<Request> requestPtr,SectionReader & tree,SectionReader & param)1423 Dbspj::build(Build_context& ctx,
1424 Ptr<Request> requestPtr,
1425 SectionReader & tree,
1426 SectionReader & param)
1427 {
1428 Uint32 tmp0, tmp1;
1429 Uint32 err = DbspjErr::ZeroLengthQueryTree;
1430 ctx.m_cnt = 0;
1431 ctx.m_scan_cnt = 0;
1432
1433 tree.getWord(&tmp0);
1434 Uint32 loop = QueryTree::getNodeCnt(tmp0);
1435
1436 DEBUG("::build()");
1437 err = DbspjErr::InvalidTreeNodeCount;
1438 if (loop == 0 || loop > NDB_SPJ_MAX_TREE_NODES)
1439 {
1440 jam();
1441 goto error;
1442 }
1443
1444 while (ctx.m_cnt < loop)
1445 {
1446 DEBUG(" - loop " << ctx.m_cnt << " pos: " << tree.getPos().currPos);
1447 tree.peekWord(&tmp0);
1448 param.peekWord(&tmp1);
1449 Uint32 node_op = QueryNode::getOpType(tmp0);
1450 Uint32 node_len = QueryNode::getLength(tmp0);
1451 Uint32 param_op = QueryNodeParameters::getOpType(tmp1);
1452 Uint32 param_len = QueryNodeParameters::getLength(tmp1);
1453
1454 err = DbspjErr::QueryNodeTooBig;
1455 if (unlikely(node_len >= NDB_ARRAY_SIZE(m_buffer0)))
1456 {
1457 jam();
1458 goto error;
1459 }
1460
1461 err = DbspjErr::QueryNodeParametersTooBig;
1462 if (unlikely(param_len >= NDB_ARRAY_SIZE(m_buffer1)))
1463 {
1464 jam();
1465 goto error;
1466 }
1467
1468 err = DbspjErr::InvalidTreeNodeSpecification;
1469 if (unlikely(tree.getWords(m_buffer0, node_len) == false))
1470 {
1471 jam();
1472 goto error;
1473 }
1474
1475 err = DbspjErr::InvalidTreeParametersSpecification;
1476 if (unlikely(param.getWords(m_buffer1, param_len) == false))
1477 {
1478 jam();
1479 goto error;
1480 }
1481
1482 #if defined(DEBUG_LQHKEYREQ) || defined(DEBUG_SCAN_FRAGREQ)
1483 printf("node: ");
1484 for (Uint32 i = 0; i<node_len; i++)
1485 printf("0x%.8x ", m_buffer0[i]);
1486 printf("\n");
1487
1488 printf("param: ");
1489 for (Uint32 i = 0; i<param_len; i++)
1490 printf("0x%.8x ", m_buffer1[i]);
1491 printf("\n");
1492 #endif
1493
1494 err = DbspjErr::UnknowQueryOperation;
1495 if (unlikely(node_op != param_op))
1496 {
1497 jam();
1498 goto error;
1499 }
1500 if (ERROR_INSERTED_CLEAR(17006))
1501 {
1502 ndbout_c("Injecting UnknowQueryOperation error 17006 at line %d file %s",
1503 __LINE__, __FILE__);
1504 jam();
1505 goto error;
1506 }
1507
1508 const OpInfo* info = NULL;
1509 if (unlikely(node_op == QueryNode::QN_SCAN_FRAG_v1))
1510 {
1511 /**
1512 * Convert the deprecated SCAN_FRAG_v1 node+param to new SCAN_FRAG:
1513 * - The 'node' formats are identical, no conversion needed.
1514 * - The QN_ScanFragParameters has two additional 'batch_size' members.
1515 * In addition there is three unused Uint32 member for future use. (5)
1516 * Extend entire param block to make room for it, fill in from 'req'.
1517 *
1518 * {len, requestInfo, resultData}
1519 * -> {len, requestInfo, resultData,
1520 * batch_size_rows, batch_size_bytes, unused0-2}
1521 */
1522 jam();
1523 QN_ScanFragParameters_v1 *param_old = (QN_ScanFragParameters_v1*)m_buffer1;
1524 const Uint32 requestInfo = param_old->requestInfo;
1525 const Uint32 resultData = param_old->resultData;
1526
1527 if (unlikely(param_len+5 >= NDB_ARRAY_SIZE(m_buffer1)))
1528 {
1529 jam();
1530 err = DbspjErr::QueryNodeParametersTooBig;
1531 goto error;
1532 }
1533 QN_ScanFragParameters *param = (QN_ScanFragParameters*)m_buffer1;
1534 /* Moving data beyond 'NodeSize' after the space for new parameters */
1535 memmove(((Uint32*)param)+param->NodeSize,
1536 ((Uint32*)param_old)+param_old->NodeSize,
1537 (param_len-param_old->NodeSize) * sizeof(Uint32));
1538 param_len+=5;
1539
1540 param->requestInfo = requestInfo;
1541 param->resultData = resultData;
1542
1543 /* Calculate and fill in param 'batchSize' from request */
1544 Signal* signal = ctx.m_start_signal;
1545 const ScanFragReq* req = (const ScanFragReq*)(signal->getDataPtr());
1546 param->batch_size_rows = req->batch_size_rows;
1547 param->batch_size_bytes = req->batch_size_bytes;
1548 param->unused0 = 0;
1549 param->unused1 = 0;
1550 param->unused2 = 0;
1551
1552 /* Execute root scan with full parallelism - as SCAN_FRAG_v1 always did */
1553 param->requestInfo |= QN_ScanFragParameters::SFP_PARALLEL;
1554
1555 info = &Dbspj::g_ScanFragOpInfo;
1556 }
1557 else if (unlikely(node_op == QueryNode::QN_SCAN_INDEX_v1))
1558 {
1559 /**
1560 * Convert the deprecated SCAN_INDEX_v1 node+param to new SCAN_FRAG:
1561 * - The 'node' formats are identical, no conversion needed.
1562 * - The QN_ScanIndexParameters has split the single batchSize into
1563 * two seperate 'batch_size' members and introduced an additional
1564 * three unused Uint32 members for future use. (Total 4)
1565 * Extend entire param block to make room for it,
1566 * fill in from old batchSize argument.
1567 *
1568 * {len, requestInfo, batchSize, resultData}
1569 * -> {len, requestInfo, resultData,
1570 * batch_size_rows, batch_size_bytes, unused0-2}
1571 */
1572 jam();
1573 QN_ScanIndexParameters_v1 *param_old = (QN_ScanIndexParameters_v1*)m_buffer1;
1574 const Uint32 requestInfo = param_old->requestInfo;
1575 const Uint32 batchSize = param_old->batchSize;
1576 const Uint32 resultData = param_old->resultData;
1577
1578 if (unlikely(param_len+4 >= NDB_ARRAY_SIZE(m_buffer1)))
1579 {
1580 jam();
1581 err = DbspjErr::QueryNodeParametersTooBig;
1582 goto error;
1583 }
1584 QN_ScanFragParameters *param = (QN_ScanFragParameters*)m_buffer1;
1585 /* Moving data beyond 'NodeSize' after the space for new parameters */
1586 memmove(((Uint32*)param)+param->NodeSize,
1587 ((Uint32*)param_old)+param_old->NodeSize,
1588 (param_len-param_old->NodeSize) * sizeof(Uint32));
1589 param_len+=4;
1590
1591 param->requestInfo = requestInfo;
1592 param->resultData = resultData;
1593 param->batch_size_rows = batchSize & ~(0xFFFFFFFF << QN_ScanIndexParameters_v1::BatchRowBits);
1594 param->batch_size_bytes = batchSize >> QN_ScanIndexParameters_v1::BatchRowBits;
1595 param->unused0 = 0;
1596 param->unused1 = 0;
1597 param->unused2 = 0;
1598
1599 info = &Dbspj::g_ScanFragOpInfo;
1600 }
1601 else
1602 {
1603 info = getOpInfo(node_op);
1604 if (unlikely(info == NULL))
1605 {
1606 jam();
1607 goto error;
1608 }
1609 }
1610
1611 QueryNode* qn = (QueryNode*)m_buffer0;
1612 QueryNodeParameters * qp = (QueryNodeParameters*)m_buffer1;
1613 qn->len = node_len;
1614 qp->len = param_len;
1615 err = (this->*(info->m_build))(ctx, requestPtr, qn, qp);
1616 if (unlikely(err != 0))
1617 {
1618 jam();
1619 goto error;
1620 }
1621
1622 /**
1623 * only first node gets access to signal
1624 */
1625 ctx.m_start_signal = NULL;
1626
1627 ndbrequire(ctx.m_cnt < NDB_ARRAY_SIZE(ctx.m_node_list));
1628 ctx.m_cnt++;
1629 }
1630 requestPtr.p->m_node_cnt = ctx.m_cnt;
1631
1632 if (ctx.m_scan_cnt > 1)
1633 {
1634 jam();
1635 requestPtr.p->m_bits |= Request::RT_MULTI_SCAN;
1636 }
1637
1638 // Set up the order of execution plan
1639 buildExecPlan(requestPtr);
1640
1641 // Construct RowBuffers where required
1642 err = initRowBuffers(requestPtr);
1643 if (unlikely(err != 0))
1644 {
1645 jam();
1646 goto error;
1647 }
1648
1649 return 0;
1650
1651 error:
1652 jam();
1653 return err;
1654 }
1655
1656 /**
1657 * initRowBuffers will decide row-buffering strategy, and init
1658 * the RowBuffers where required.
1659 */
Uint32
Dbspj::initRowBuffers(Ptr<Request> requestPtr)
{
  /**
   * Decide the row-buffering strategy for the Request and construct
   * the per-TreeNode RowCollections where buffering is required.
   * Returns 0 (reserved for future error codes).
   */
  jam();

  /**
   * Init BUFFERS iff Request has to buffer any rows/matches
   */
  if (requestPtr.p->m_bits & Request::RT_BUFFERS)
  {
    jam();

    /**
     * Iff, multi-scan is non-bushy (normal case)
     *   we don't strictly need BUFFER_VAR for RT_BUFFERS
     *   but could instead pop-row stack frame,
     *     however this is not implemented...
     *
     * so, currently use BUFFER_VAR if 'RT_MULTI_SCAN'
     *
     * NOTE: This should easily be solvable by having a
     *       RowBuffer for each TreeNode instead
     */
    if (requestPtr.p->m_bits & Request::RT_MULTI_SCAN)
    {
      jam();
      requestPtr.p->m_rowBuffer.init(BUFFER_VAR);
    }
    else
    {
      jam();
      requestPtr.p->m_rowBuffer.init(BUFFER_STACK);
    }

    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
    Ptr<TreeNode> treeNodePtr;
    for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
    {
      jam();
      ndbassert(treeNodePtr.p->m_batch_size > 0);
      /**
       * Construct a List or Map RowCollection for those TreeNodes
       * requiring rows to be buffered.
       */
      if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MAP)
      {
        jam();
        treeNodePtr.p->m_rows.construct (RowCollection::COLLECTION_MAP,
                                         requestPtr.p->m_rowBuffer,
                                         treeNodePtr.p->m_batch_size);
      }
      else if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY)
      {
        jam();
        treeNodePtr.p->m_rows.construct (RowCollection::COLLECTION_LIST,
                                         requestPtr.p->m_rowBuffer,
                                         treeNodePtr.p->m_batch_size);
      }
    }
  }

  return 0;
} // Dbspj::initRowBuffers
1723
1724
1725 /**
1726 * setupAncestors():
1727 *
1728 * Complete the query tree topology as given by the SPJ API:
1729 *
1730 * Fill in the m_ancestors bitMask, and set the referrence to
1731 * our closest scanAncestor in each TreeNode. Also set
1732 * the 'm_coverage' of each TreeNode.
1733 */
void
Dbspj::setupAncestors(Ptr<Request> requestPtr,
                      Ptr<TreeNode> treeNodePtr,
                      Uint32 scanAncestorPtrI)
{
  /**
   * Depth-first recursion over the child-node topology:
   * set each TreeNode's closest scan ancestor, propagate the
   * m_ancestors bitmask down, and accumulate m_coverage (the set of
   * nodes in this node's subtree, including itself) on the way up.
   */
  LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
  Local_dependency_map const childList(pool, treeNodePtr.p->m_child_nodes);
  Dependency_map::ConstDataBufferIterator it;

  // Record closest scan ancestor; if this node is itself a scan, it
  // becomes the scan ancestor for its children.
  treeNodePtr.p->m_scanAncestorPtrI = scanAncestorPtrI;
  if (treeNodePtr.p->isScan())
  {
    scanAncestorPtrI = treeNodePtr.i;
  }

  for (childList.first(it); !it.isNull(); childList.next(it))
  {
    jam();
    Ptr<TreeNode> childPtr;
    m_treenode_pool.getPtr(childPtr, *it.data);

    // Child inherits this node's ancestor set, plus this node itself
    childPtr.p->m_ancestors = treeNodePtr.p->m_ancestors;
    childPtr.p->m_ancestors.set(treeNodePtr.p->m_node_no);

    setupAncestors(requestPtr, childPtr, scanAncestorPtrI);

    // Merge the child's subtree coverage into ours
    treeNodePtr.p->m_coverage.bitOR(childPtr.p->m_coverage);
  }
  treeNodePtr.p->m_coverage.set(treeNodePtr.p->m_node_no);
}
1764
1765
1766 /**
1767 * buildExecPlan()
1768 *
1769 * Decides the order/pace in which the different TreeNodes should
1770 * be executed. We basically choose between two strategies:
1771 *
1772 * Lookup-queries returns at most a single row from each
1773 * TreeNode in the SPJ-request. We believe these to impose
1774 * a relatively low CPU load on the system. We try to reduce
1775 * the elapsed execution time for these requests by
1776 * submitting as many of the LQHKEYREQ's as possible in parallel.
1777 * Thereby also taking advantage of the datanode parallelism.
1778 *
1779 * On the other hand, scan queries has the potential for returning
1780 * huge result sets. Furthermore, the root scan operation will
1781 * result is SPJ sub requests being sent to all datanodes. Thus
1782 * the datanode parallelism is utilized without executing
1783 * the SPJ requests TreeNodes in parallel. For such queries
1784 * we will execute INNER-joined TreeNodes in sequence, wherever
1785 * possible taking advantage of that we can skip further operations
1786 * on rows where preceeding matches were not found.
1787 *
1788 * Note that prior to introducing INNER-join handling in SPJ,
1789 * all queries effectively were executed with the most parallel
1790 * execution plan.
1791 */
Uint32
Dbspj::buildExecPlan(Ptr<Request> requestPtr)
{
  /**
   * Decide the execution order of the Request's TreeNodes:
   * sequential (INNER-join aware) plan for scan requests,
   * fully parallel plan for lookup requests.
   * Returns 0 on success, else a DbspjErr error code.
   */
  Ptr<TreeNode> treeRootPtr;
  Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
  list.first(treeRootPtr);

  /**
   * Brute force solution to ensure that all rows in
   * batch are sorted if requested:
   *
   * In a scan-scan (MULTI_SCAN) request the result is effectively
   * generated as a cross product between the scans. If the child-scans
   * batches need another NEXTREQ to retrieve remaining rows, the parent
   * scans result rows will effectively be repeated together with the new
   * rows from the child scans.
   * By restricting the parent scan to a batch size of one row, the
   * parent rows will still be sorted, even if multiple child batches
   * has to be fetched.
   */
  if (treeRootPtr.p->m_bits & TreeNode::T_SORTED_ORDER &&
      requestPtr.p->m_bits & Request::RT_MULTI_SCAN)
  {
    jam();
    ndbassert(treeRootPtr.p->m_bits & TreeNode::T_SCAN_PARALLEL);
    // Patch the stored SCAN_FRAGREQ of the root to fetch one row per batch
    ScanFragData& data = treeRootPtr.p->m_scanFrag_data;
    ScanFragReq* const dst = reinterpret_cast<ScanFragReq*>(data.m_scanFragReq);
    dst->batch_size_rows = 1;
  }

  // Complete the topology info (ancestor masks, scan ancestors, coverage)
  setupAncestors(requestPtr, treeRootPtr, RNIL);

  if (requestPtr.p->isScan())
  {
    const Uint32 err = planSequentialExec(requestPtr, treeRootPtr,
                                          NullTreeNodePtr, NullTreeNodePtr);
    if (unlikely(err))
      return err;
  }
  else
  {
    const Uint32 err = planParallelExec(requestPtr, treeRootPtr);
    if (unlikely(err))
      return err;
  }

#ifdef VM_TRACE
  DEBUG("Execution plan, TreeNode execution order:");
  dumpExecPlan(requestPtr, treeRootPtr);
#endif

  return 0;
} // buildExecPlan()
1845
1846
1847 /**
1848 * planParallelExec():
1849 *
1850 * Set up the most parallelized execution plan for the query.
1851 * This happens to be the same query topology as represented by the
1852 * child / parent references represented in SPJ request from the API.
1853 * So we could simply copy the child / ancestor dependencies as
1854 * the final order of execution.
1855 *
1856 * For such an execution plan we may execute all child-TreeNodes in
1857 * parallel - Even if there are non-matching child rows which will
1858 * eventually result in both the parent row, and all adjacent child rows
1859 * to be eliminated from a final inner-joined result set.
1860 *
1861 * Such a join plan is most suited for a query processing relatively few
1862 * rows, where the overhead of returning rows which are later eliminated
1863 * is low. The possible advantage if this query plan is a lower elapsed time
1864 * for the query execution, possible at the cost of some higher CPU usage.
1865 */
Uint32
Dbspj::planParallelExec(Ptr<Request> requestPtr,
                        Ptr<TreeNode> treeNodePtr)
{
  /**
   * Recursively copy the parent/child topology from the API request
   * into the execution plan (m_next_nodes), yielding the maximally
   * parallel plan. Returns 0 on success, else OutOfQueryMemory.
   */
  LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
  Local_dependency_map child(pool, treeNodePtr.p->m_child_nodes);
  Local_dependency_map execList(pool, treeNodePtr.p->m_next_nodes);
  Dependency_map::ConstDataBufferIterator it;

  // In the parallel plan, predecessors/dependencies equal the ancestors
  treeNodePtr.p->m_predecessors = treeNodePtr.p->m_ancestors;
  treeNodePtr.p->m_dependencies = treeNodePtr.p->m_ancestors;

  for (child.first(it); !it.isNull(); child.next(it))
  {
    Ptr<TreeNode> childPtr;
    m_treenode_pool.getPtr(childPtr, *it.data);
    // Each child is executed directly after this node
    if (unlikely(!execList.append(&childPtr.i, 1)))
    {
      jam();
      return DbspjErr::OutOfQueryMemory;
    }

    const Uint32 err = planParallelExec(requestPtr, childPtr);
    if (unlikely(err))
      return err;

    treeNodePtr.p->m_coverage.bitOR(childPtr.p->m_coverage);
  }

  return 0;
} // Dbspj::planParallelExec
1897
1898
1899 /**
1900 * planSequentialExec()
1901 *
1902 * Build an execution plan where INNER-joined TreeNodes are executed in
1903 * sequence, such that further evaluation of not matching rows could be
1904 * skipped as early as possible.
1905 *
1906 * Steps:
1907 *
1908 * 1)
1909 * Each 'branch' has the property that it starts with either a scan-TreeNode,
1910 * or an outer joined (lookup-) TreeNode. Any INNER-joined lookup-nodes having
1911 * this TreeNode as a (grand-)parent, is also a member of the branch.
1912 *
1913 * Such a 'branch' of INNER-joined lookups has the property that an EQ-match
1914 * has to be found from all its TreeNodes in order for any of the related
1915 * rows to be part of the joined result set. Thus, during execution we can
1916 * skip any further child lookups as soon as a non-match is found. This is
1917 * represented in the execution plan by appending the INNER-joined lookups
1918 * in a sequence.
1919 *
1920 * Note that we are 'greedy' in appending these INNER-joined lookups,
1921 * such that a lookup-TreeNode may effectively be executed prior to a
1922 * scan-TreeNode, even if the scan is located before the lookup in the
1923 * 'm_nodes' list produced by the SPJ-API. This is intentional as a
1924 * potential non-INNER-joined lookup row would eliminate the need for
1925 * executing the much more expensive (index-)scan operation.
1926 *
1927 * 2)
1928 * Recursively append a *single* INNER-joined scan-*branch* after the
1929 * end of the branch from 1). As it is called recursively, the scan
1930 * branch will append further lookup-nodes which depended on this scan-node,
1931 * and finaly append any remaining INNER-joined scan branches.
1932 *
1933 * Note1 that due to old legacy in the SPJ-API protocol, all scan nodes
1934 * has to be executed in order relative to each other. (Explains the 'single'
1935 * mentioned above)
1936 *
1937 * Note2: After the two steps above has completed, including the recursive call
1938 * handling the INNER-joined scan, all INNER-joined TreeNodes to be joined with
1939 * this 'branch' have been added to the exec plan.
1940 *
1941 * Note3: Below we use the term 'non-INNER-joined', instead of 'OUTER-joined'.
 * This is due to SPJ-API protocol compatibility, where we previously didn't
1943 * tag the TreeNodes as being INNER-joined or not. Thus when receiving a SPJ
1944 * request from an API client, we can't tell for sure whether the TreeNode
1945 * is outer joined, or if the (old) client simply didn't specify INNER-joins.
1946 * Thus all we know is that nodes are 'non-INNER-joined'.
1947 *
1948 * Also note that for any request from such an old API client, there will
1949 * not be appended any 'sequential' TreeNodes to the exec plan in 1) and 2)
1950 * above. Only steps 3) and 4) below will effectively be used, which will
 * (intentionally) result in a parallelized query plan, identical to what
1952 * it used to be prior to introducing these INNER-join optimizations.
1953 *
1954 * 3)
1955 * Recursively append all non-INNER-joined lookup branches to be executed
1956 * after the sequence of INNER-joined-lookups (from 1). Note that these
1957 * branches are executed in sequence in a left -> right order, such
1958 * that when the 'left' branch is completed, we 'RESUME' into the 'right'
1959 * branch. This is done in order to avoid overflowing the job buffers
1960 * due to too many LQHKEYREQ-signals being sent at once.
1961 * The 'nextBranchPtr' is set up by this step as the 'right' lookup branch
1962 * to RESUME. (See appendTreeNode() for more about RESUME handling)
1963 *
1964 * 4)
1965 * Recursively append all non-INNER-joined scan branches to be executed
1966 * in *parallel* after the sequence of INNER-joined-lookups (from 1).
1967 * As we do not really handle OUTER-joined scans (yet), this is only
1968 * in effect when we get a SPJ request from an old-API, which do not
1969 * specify INNER-join for a scan-TreeNode. Thus the old type 'submit
1970 * scan in parallel'-plan will be produced.
1971 * For a client using the updated SPJ-API, all scans will be handled in 2)
1972 *
1973 */
1974 Uint32
planSequentialExec(Ptr<Request> requestPtr,const Ptr<TreeNode> branchPtr,Ptr<TreeNode> prevExecPtr,const Ptr<TreeNode> nextBranchPtr)1975 Dbspj::planSequentialExec(Ptr<Request> requestPtr,
1976 const Ptr<TreeNode> branchPtr,
1977 Ptr<TreeNode> prevExecPtr,
1978 const Ptr<TreeNode> nextBranchPtr)
1979 {
1980 DEBUG("planSequentialExec, start branch at treeNode no: " << branchPtr.p->m_node_no);
1981
1982 // Append head of branch to be executed after 'prevExecPtr'
1983 const Uint32 err = appendTreeNode(requestPtr, branchPtr, prevExecPtr, nextBranchPtr);
1984 if (unlikely(err))
1985 return err;
1986
1987 Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
1988 TreeNodeBitMask predecessors(branchPtr.p->m_predecessors);
1989 predecessors.set(branchPtr.p->m_node_no);
1990
1991 /**
1992 * 1) Append all INNER-joined lookups to the 'plan' to be executed in sequence.
1993 * Maintain the set of 'predecessor' TreeNodes which are already executed.
1994 * Don't append TreeNodes where its ancestors are not part of the 'plan'
1995 */
1996 Ptr<TreeNode> treeNodePtr(branchPtr);
1997 prevExecPtr = treeNodePtr;
1998 while (list.next(treeNodePtr))
1999 {
2000 if (treeNodePtr.p->m_predecessors.isclear() &&
2001 predecessors.contains(treeNodePtr.p->m_ancestors) &&
2002 treeNodePtr.p->m_bits & TreeNode::T_INNER_JOIN &&
2003 treeNodePtr.p->isLookup())
2004 {
2005
2006 DEBUG("planSequentialExec, append INNER-join lookup treeNode: "
2007 << treeNodePtr.p->m_node_no
2008 << ", to branch at: " << branchPtr.p->m_node_no
2009 << ", as 'descendant' of node: " << prevExecPtr.p->m_node_no);
2010
2011 // Add INNER-joined lookup treeNode to the join plan:
2012 const Uint32 err = appendTreeNode(requestPtr, treeNodePtr, prevExecPtr, nextBranchPtr);
2013 if (unlikely(err))
2014 return err;
2015
2016 predecessors.set(treeNodePtr.p->m_node_no);
2017 prevExecPtr = treeNodePtr;
2018 }
2019 } //for 'all request TreeNodes', starting from branchPtr
2020
2021 /**
2022 * 2) After this INNER-joined lookup sequence:
2023 * Recursively append a *single* INNER-joined scan-branch, if found.
2024 *
2025 * Note that this branch, including any non-INNER joined branches below,
2026 * are planned to be executed in *parallel* after the 'prevExecPtr',
2027 * which is the end of the sequence of INNER-lookups.
2028 */
2029 treeNodePtr = branchPtr; //Start over
2030 while (list.next(treeNodePtr))
2031 {
2032 /**
2033 * Scan has to be executed in same order as found in the
2034 * list of TreeNodes. (Legacy of the original SPJ-API result protocol)
2035 */
2036 if (treeNodePtr.p->m_predecessors.isclear() &&
2037 predecessors.contains(treeNodePtr.p->m_ancestors) &&
2038 treeNodePtr.p->m_bits & TreeNode::T_INNER_JOIN)
2039 {
2040 DEBUG("planSequentialExec, append INNER-joined scan-branch at treeNode: "
2041 << treeNodePtr.p->m_node_no);
2042
2043 ndbassert(treeNodePtr.p->isScan());
2044 const Uint32 err = planSequentialExec(requestPtr, treeNodePtr, prevExecPtr,
2045 NullTreeNodePtr);
2046 if (unlikely(err))
2047 return err;
2048 break;
2049 }
2050 } //for 'all request TreeNodes', starting from branchPtr
2051
2052
2053 /**
2054 * Note: All INNER-Joins within current 'branch' will now have been handled,
2055 * either directly within this method at 1), or by recursively calling it in 2).
2056 *
2057 * 3a) collect any non-INNER-joined lookup branches
2058 */
2059 Ptr<TreeNode> outerBranches[NDB_SPJ_MAX_TREE_NODES+1];
2060 int outerCnt = 0;
2061
2062 treeNodePtr = branchPtr; //Start over
2063 while (list.next(treeNodePtr))
2064 {
2065 if (treeNodePtr.p->m_predecessors.isclear() &&
2066 predecessors.contains(treeNodePtr.p->m_ancestors))
2067 {
2068 if (treeNodePtr.p->isLookup() &&
2069 !branchPtr.p->m_predecessors.contains(treeNodePtr.p->m_ancestors))
2070 {
2071 // A non-INNER joined lookup-TreeNode
2072 outerBranches[outerCnt++] = treeNodePtr;
2073 }
2074 }
2075 } //for 'all request TreeNodes', starting from branchPtr
2076
2077 /**
2078 * 3b) Append the non-INNER-joined lookup branches to the end of the INNER-joined
2079 * lookup sequence, (at 'prevExecPtr'), will be executed in a sequence, parallell
2080 * with the scan branch from 2).
2081 *
2082 */
2083 outerBranches[outerCnt] = nextBranchPtr; //Resume point for last
2084 for (int i = 0; i < outerCnt; i++)
2085 {
2086 DEBUG("planSequentialExec, append non-INNER-joined branch no: "
2087 << outerBranches[i].p->m_node_no);
2088
2089 const Uint32 err = planSequentialExec(requestPtr, outerBranches[i], prevExecPtr,
2090 outerBranches[i+1]); //RESUME point
2091 if (unlikely(err))
2092 return err;
2093 }
2094
2095 /**
2096 * 4) Append any non-INNER joined scan branches to the end of the INNER-joined
2097 * lookup sequence, (at 'prevExecPtr')
2098 */
2099 treeNodePtr = branchPtr; //Start over
2100 while (list.next(treeNodePtr))
2101 {
2102 if (treeNodePtr.p->m_predecessors.isclear() &&
2103 predecessors.contains(treeNodePtr.p->m_ancestors))
2104 {
2105 if (!branchPtr.p->m_predecessors.contains(treeNodePtr.p->m_ancestors))
2106 {
2107 jam();
2108 ndbassert(treeNodePtr.p->isScan());
2109
2110 DEBUG("planSequentialExec, append non-INNER-joined scan-treeNode: "
2111 << treeNodePtr.p->m_node_no
2112 << ", to branch at: " << branchPtr.p->m_node_no
2113 << ", as 'descendant' of node: " << prevExecPtr.p->m_node_no);
2114
2115 const Uint32 err = planSequentialExec(requestPtr, treeNodePtr, prevExecPtr,
2116 NullTreeNodePtr);
2117 if (unlikely(err))
2118 return err;
2119 }
2120 }
2121 }
2122
2123 return 0;
2124 } // ::planSequentialExec
2125
2126
/**
 * appendTreeNode()
 *
 * Appends 'treeNodePtr' to the execution plan after 'prevExecPtr'.
 * In case 'treeNodePtr' is part of an outer joined tree branch,
 * 'nextBranchPtr' may refer to a 'resume point' outside of the current
 * outer joined branch.
 *
 * In case execution of a row set within the current branch is
 * terminated due to no INNER-joined matches found, execution will be
 * resumed at 'nextBranchPtr'.
 *
 * Fills in the 'predecessors' and 'dependencies' bitmasks.
 *
 * Sets up the extra 'scheduling policy' described by 'm_resumeEvents'
 * and 'm_resumePtrI', and BUFFERing of rows and/or their match bitmask
 * as required by the chosen scheduling.
 */
Uint32
Dbspj::appendTreeNode(Ptr<Request> requestPtr,
                      Ptr<TreeNode> treeNodePtr,
                      Ptr<TreeNode> prevExecPtr,
                      const Ptr<TreeNode> nextBranchPtr)
{
  if (prevExecPtr.isNull())
  {
    // No predecessor -> nothing to link into the plan.
    // Assert that no further action would have been required below.
    ndbassert(nextBranchPtr.isNull());
    ndbassert(treeNodePtr.p->m_parentPtrI == RNIL);
    ndbassert(treeNodePtr.p->m_scanAncestorPtrI == RNIL);
    return 0;
  }

  DEBUG("appendTreeNode, append treeNode: " << treeNodePtr.p->m_node_no
        << ", as 'descendant' of node: " << prevExecPtr.p->m_node_no);
  {
    LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);

    // Add treeNode to the execution plan:
    // it becomes one of the 'm_next_nodes' of its immediate predecessor.
    Local_dependency_map execList(pool, prevExecPtr.p->m_next_nodes);
    if (unlikely(!execList.append(&treeNodePtr.i, 1)))
    {
      jam();
      return DbspjErr::OutOfQueryMemory;
    }
  }

  // Inherit the predecessor- and dependency-sets from 'prevExec',
  // then add 'prevExec' itself to both sets.
  treeNodePtr.p->m_predecessors.bitOR(prevExecPtr.p->m_predecessors);
  treeNodePtr.p->m_predecessors.set(prevExecPtr.p->m_node_no);

  treeNodePtr.p->m_dependencies = prevExecPtr.p->m_dependencies;
  treeNodePtr.p->m_dependencies.set(prevExecPtr.p->m_node_no);

  // Invariant: ancestors ⊆ dependencies ⊆ predecessors
  ndbassert(treeNodePtr.p->m_predecessors.contains(treeNodePtr.p->m_dependencies));
  ndbassert(treeNodePtr.p->m_dependencies.contains(treeNodePtr.p->m_ancestors));

  /**
   * Below we set up any special scheduling policy.
   *
   * If nothing is set, completion of a request will submit new request(s) for
   * all 'm_next_nodes' in *parallel*. The result rows returned from the request
   * will be used directly as the 'parentRow' to produce the new request(s).
   *
   * So anything set up below is an exception to this basic rule!
   */

  /**
   * If a 'next branch' is specified, the current branch should start execution
   * from this branch when it completes. This is part of our load regulation logic
   * which prevents it from overflowing the job buffers due to a scan driven
   * star-join query topology submitting all its LQHKEYREQSs at once.
   *
   * Instead we now start only the first child lookup operation when a scan
   * completes. Completion of requests from this lookup operation will in turn
   * either start the next INNER-joined lookup when a TRANSID_AI result arrives,
   * or use the 'next branch'-RESUME logic set up below if not INNER-joined.
   * Together this maintains a steady pace of LQHKEYREQSs being submitted, where
   * the total number of submitted REQs in the pipeline will be <= number
   * of rows returned from the preceding scan batch. (A 1::1 fanout)
   *
   * The 'next branch'-RESUME logic is controlled by setting the following
   * m_resumeEvents flags:
   *
   *  - TN_ENQUEUE_OP: The first TreeNode in a 'next branch' will enqueue
   *      the correlation-id of all rows TRANSID_AI-returned from its parent.
   *      (As opposed to submitting it for immediate execution). Any of the
   *      RESUME-actions below will later pick one of the ENQUEUED rows
   *      for execution. (Also implies that the parent of any ENQUEUEing-TreeNode
   *      needs to BUFFER_ROW).
   *  - TN_RESUME_REF: If we get a LQHKEYREF-reply it terminates any further
   *      INNER-join operations originating from the head of this branch.
   *      As this frees a scheduling quota, we may start an operation from
   *      the nextBranch to be executed.
   *  - TN_RESUME_CONF: Set only for the last operation in the branch.
   *      When it successfully completes, a scheduling quota is available,
   *      and we may start an operation from the nextBranch to be executed.
   */
  if (!nextBranchPtr.isNull())
  {
    // Should only be used for lookup resuming another branch of lookups,
    // within the same scanAncestor scope.
    ndbassert(treeNodePtr.p->isLookup());
    ndbassert(nextBranchPtr.p->isLookup());
    ndbassert(nextBranchPtr.p->m_scanAncestorPtrI == treeNodePtr.p->m_scanAncestorPtrI);

    treeNodePtr.p->m_resumePtrI = nextBranchPtr.i;
    nextBranchPtr.p->m_predecessors.set(treeNodePtr.p->m_node_no);

    /**
     * Only the last TreeNode in a branch should have TN_RESUME_CONF set.
     * If we now append to a branch having a resume position, remove RESUME_CONF.
     */
    if (prevExecPtr.p->m_resumePtrI != RNIL)
    {
      ndbassert(prevExecPtr.p->m_resumeEvents & TreeNode::TN_RESUME_REF);
      // Only for the last resuming TreeNode
      prevExecPtr.p->m_resumeEvents &= ~TreeNode::TN_RESUME_CONF;
    }

    // Assume: Last node in this outer-joined tree branch: Always resume 'next'.
    treeNodePtr.p->m_resumeEvents |= TreeNode::TN_RESUME_CONF |
                                     TreeNode::TN_RESUME_REF;

    // The 'to be resumed' operations are enqueued at the head of nextBranch.
    nextBranchPtr.p->m_resumeEvents |= TreeNode::TN_ENQUEUE_OP;
  }

  /** Example:
   *
   *       scan1
   *       /   \      ====INNER-join executed as===>  scan1 -> scan2 -> scan3
   *    scan2  scan3
   *
   * Considering the case above, both scan2 and scan3 have scan1 as their
   * scanAncestor. In an INNER-joined execution plan, we will take advantage of
   * that a match between scan1 join scan2 rows is required, else 'join scan3'
   * could be skipped. Thus, even if scan1 is the scan-ancestor of scan3, we
   * will execute scan2 in between these.
   *
   * Note that the result from scan2 may have multiple TRANSID_AI results returned
   * for each row from scan1. Thus we can't directly use the returned scan2 rows
   * to trigger production of the scan3 request. (Due to cardinality mismatch).
   * The scan3 request has to be produced based on scan1 results!
   *
   * We set up the scheduling policy below to solve this:
   *  - TN_EXEC_WAIT is set on 'scan3', which will prevent TRANSID_AI
   *    results from scan2 from submitting operations to scan3.
   *  - TN_RESUME_NODE is set on 'scan3' which will result in
   *    ::resumeBufferedNode() being called when all TreeNodes
   *    which we depend on have completed their batches.
   *    (Also implies that the parent of any to-be-resumed-nodes
   *    needs to BUFFER_ROW).
   *
   * ::resumeBufferedNode() will iterate all its buffered parent results.
   * For each row we will check if the required INNER-join matches from
   * the TreeNodes it has INNER-join dependencies on. Non-matching parent
   * rows are skipped from further requests.
   *
   * We maintain the found matches in the m_match-bitmask in the
   * BUFFER structure of each TreeNode scanAncestor. Below we set
   * the T_BUFFER_MATCH on the scanAncestor, and all scans in between,
   * in order to have the match-bitmap set up.
   */
  if (treeNodePtr.p->isScan() &&
      treeNodePtr.p->m_scanAncestorPtrI != RNIL)
  {
    Ptr<TreeNode> scanAncestorPtr;
    m_treenode_pool.getPtr(scanAncestorPtr, treeNodePtr.p->m_scanAncestorPtrI);
    Ptr<TreeNode> ancestorPtr(scanAncestorPtr);

    // Note that scans are always added to exec plan such that their
    // relative order is kept.

    // Walk the node list from the scanAncestor up to (not including) this
    // treeNode, looking for scans executed 'in between' which we depend on.
    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
    while (list.next(ancestorPtr) && ancestorPtr.i != treeNodePtr.i)
    {
      if (ancestorPtr.p->isScan() &&
          treeNodePtr.p->m_dependencies.get(ancestorPtr.p->m_node_no))
      {
        /**
         * 'ancestorPtr' is a scan executed in between this scan and its
         * scanAncestor. It is not among the ancestors of the TreeNode to
         * be executed.
         */

        // Need 'resume-node' scheduling in preparation for 'next' scan-branch:
        treeNodePtr.p->m_resumeEvents |= TreeNode::TN_EXEC_WAIT |
                                         TreeNode::TN_RESUME_NODE;

        requestPtr.p->m_bits |= Request::RT_BUFFERS;
        scanAncestorPtr.p->m_bits |= TreeNode::T_BUFFER_MAP |
                                     TreeNode::T_BUFFER_MATCH;

        /**
         * BUFFER_MATCH all scan ancestors of this treeNode which we
         * depend on (May exclude some outer-joined scan branches.)
         */
        if (!ancestorPtr.p->isLeaf())
        {
          ancestorPtr.p->m_bits |= TreeNode::T_BUFFER_MAP |
                                   TreeNode::T_BUFFER_MATCH;
        }
      }
    }
  }

  /**
   * Only the result rows from the 'prevExec' are directly available when
   * operations for this TreeNode are scheduled. If that is not the parent
   * of this TreeNode, we have to BUFFER the parent rows such that
   * they can be looked up by the correlationId when needed. NOTE, that
   * all Lookup result rows having the same scanAncestor will also
   * share the same correlationId as their scanAncestor. Such that the
   * correlationId from a prevExec result row may be used to
   * BUFFER_MAP-locate the related parent rows.
   *
   * Also take care of buffering parent rows for enqueued ops and
   * to-be-resumed nodes, as described above.
   */
  if (treeNodePtr.p->m_parentPtrI != prevExecPtr.i ||
      (treeNodePtr.p->m_resumeEvents & TreeNode::TN_ENQUEUE_OP) ||
      (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_NODE))
  {
    /**
     * As execution of this tree branch is not initiated by
     * its own parent, we need to buffer the parent rows
     * such that they can be located when needed.
     */
    Ptr<TreeNode> parentPtr;
    m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
    parentPtr.p->m_bits |= TreeNode::T_BUFFER_MAP |
                           TreeNode::T_BUFFER_ROW;
    requestPtr.p->m_bits |= Request::RT_BUFFERS;
  }

  return 0;
}
2363
2364
2365 void
dumpExecPlan(Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)2366 Dbspj::dumpExecPlan(Ptr<Request> requestPtr,
2367 Ptr<TreeNode> treeNodePtr)
2368 {
2369 LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
2370 const Local_dependency_map nextExec(pool, treeNodePtr.p->m_next_nodes);
2371 Dependency_map::ConstDataBufferIterator it;
2372
2373 DEBUG("TreeNode no: " << treeNodePtr.p->m_node_no
2374 << ", coverage are: " << treeNodePtr.p->m_coverage.rep.data[0]
2375 << ", ancestors are: " << treeNodePtr.p->m_ancestors.rep.data[0]
2376 << ", predecessors are: " << treeNodePtr.p->m_predecessors.rep.data[0]
2377 << ", depending on: " << treeNodePtr.p->m_dependencies.rep.data[0]
2378 );
2379
2380 if (treeNodePtr.p->isLookup())
2381 {
2382 DEBUG(" 'Lookup'-node");
2383 }
2384 else if (treeNodePtr.p->isScan())
2385 {
2386 DEBUG(" '(Index-)Scan'-node");
2387 }
2388
2389 if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_EXEC_WAIT)
2390 {
2391 DEBUG(" has EXEC_WAIT");
2392 }
2393
2394 if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_ENQUEUE_OP)
2395 {
2396 DEBUG(" ENQUEUE, wait to be resumed");
2397 }
2398 if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_NODE)
2399 {
2400 DEBUG(" has RESUME_NODE");
2401 }
2402 if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_CONF)
2403 {
2404 DEBUG(" has RESUME_CONF");
2405 }
2406 if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_REF)
2407 {
2408 DEBUG(" has RESUME_REF");
2409 }
2410
2411 static const Uint32 BufferAll = (TreeNode::T_BUFFER_ROW|TreeNode::T_BUFFER_MATCH);
2412 if ((treeNodePtr.p->m_bits & BufferAll) == BufferAll)
2413 {
2414 DEBUG(" BUFFER 'ROWS'+'MATCH'");
2415 }
2416 else if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ROW)
2417 {
2418 DEBUG(" BUFFER 'ROWS'");
2419 }
2420 else if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH)
2421 {
2422 DEBUG(" BUFFER 'MATCH'");
2423 }
2424
2425 if (treeNodePtr.p->m_resumePtrI != RNIL)
2426 {
2427 Ptr<TreeNode> resumeTreeNodePtr;
2428 m_treenode_pool.getPtr(resumeTreeNodePtr, treeNodePtr.p->m_resumePtrI);
2429 DEBUG(" may resume node: " << resumeTreeNodePtr.p->m_node_no);
2430 }
2431
2432 for (nextExec.first(it); !it.isNull(); nextExec.next(it))
2433 {
2434 Ptr<TreeNode> nextPtr;
2435 m_treenode_pool.getPtr(nextPtr, * it.data);
2436 DEBUG(" TreeNode no: " << treeNodePtr.p->m_node_no
2437 << ", has nextExec: " << nextPtr.p->m_node_no);
2438 }
2439
2440 for (nextExec.first(it); !it.isNull(); nextExec.next(it))
2441 {
2442 Ptr<TreeNode> nextPtr;
2443 m_treenode_pool.getPtr(nextPtr, * it.data);
2444 dumpExecPlan(requestPtr, nextPtr);
2445 }
2446 }
2447
2448
2449 Uint32
createNode(Build_context & ctx,Ptr<Request> requestPtr,Ptr<TreeNode> & treeNodePtr)2450 Dbspj::createNode(Build_context& ctx, Ptr<Request> requestPtr,
2451 Ptr<TreeNode> & treeNodePtr)
2452 {
2453 /**
2454 * In the future, we can have different TreeNode-allocation strategies
2455 * that can be setup using the Build_context
2456 *
2457 */
2458 if (ERROR_INSERTED_CLEAR(17005))
2459 {
2460 ndbout_c("Injecting OutOfOperations error 17005 at line %d file %s",
2461 __LINE__, __FILE__);
2462 jam();
2463 return DbspjErr::OutOfOperations;
2464 }
2465 if (m_treenode_pool.seize(requestPtr.p->m_arena, treeNodePtr))
2466 {
2467 DEBUG("createNode - seize -> ptrI: " << treeNodePtr.i);
2468 new (treeNodePtr.p) TreeNode(requestPtr.i);
2469 ctx.m_node_list[ctx.m_cnt] = treeNodePtr;
2470 Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2471 list.addLast(treeNodePtr);
2472 treeNodePtr.p->m_node_no = ctx.m_cnt;
2473 return 0;
2474 }
2475 return DbspjErr::OutOfOperations;
2476 }
2477
/**
 * Depending on the query type, a 'prepare' phase might be required
 * before starting the real data retrieval from the query.
 *
 * All ::exec<FOO> methods handling replies related to the query
 * prepare phase should call ::checkPrepareComplete() before
 * they return.
 */
2486 void
prepare(Signal * signal,Ptr<Request> requestPtr)2487 Dbspj::prepare(Signal* signal,
2488 Ptr<Request> requestPtr)
2489 {
2490 Uint32 err = 0;
2491 if (requestPtr.p->m_bits & Request::RT_NEED_PREPARE)
2492 {
2493 jam();
2494 requestPtr.p->m_outstanding = 0;
2495 requestPtr.p->m_state = Request::RS_PREPARING;
2496
2497 Ptr<TreeNode> nodePtr;
2498 Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2499 for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
2500 {
2501 jam();
2502 /**
2503 * Verify existence of all involved tables.
2504 */
2505 err = checkTableError(nodePtr);
2506 if (unlikely(err))
2507 {
2508 jam();
2509 break;
2510 }
2511 if (nodePtr.p->m_bits & TreeNode::T_NEED_PREPARE)
2512 {
2513 jam();
2514 ndbassert(nodePtr.p->m_info != NULL);
2515 ndbassert(nodePtr.p->m_info->m_prepare != NULL);
2516 (this->*(nodePtr.p->m_info->m_prepare))(signal, requestPtr, nodePtr);
2517 }
2518 }
2519
2520 /**
2521 * preferably RT_NEED_PREPARE should only be set if blocking
2522 * calls are used, in which case m_outstanding should have been increased
2523 */
2524 ndbassert(err || requestPtr.p->m_outstanding);
2525 }
2526 if (unlikely(err))
2527 {
2528 jam();
2529 abort(signal, requestPtr, err);
2530 return;
2531 }
2532 }
2533
/**
 * Check if all outstanding 'prepare' work has completed.
 * After prepare completion, start the query itself.
 *
 * A prepare completion could also complete the entire request.
 * Thus, ::checkBatchComplete() is also called as part of
 * prepare completion.
 */
2542 void
checkPrepareComplete(Signal * signal,Ptr<Request> requestPtr)2543 Dbspj::checkPrepareComplete(Signal* signal, Ptr<Request> requestPtr)
2544 {
2545 if (requestPtr.p->m_outstanding > 0)
2546 {
2547 return;
2548 }
2549
2550 do //To simplify error/exit handling, no real loop
2551 {
2552 jam();
2553 if (unlikely((requestPtr.p->m_state & Request::RS_ABORTING) != 0))
2554 {
2555 jam();
2556 break;
2557 }
2558
2559 Ptr<TreeNode> nodePtr;
2560 {
2561 Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2562 ndbrequire(list.first(nodePtr));
2563 }
2564 Uint32 err = checkTableError(nodePtr);
2565 if (unlikely(err != 0))
2566 {
2567 jam();
2568 abort(signal, requestPtr, err);
2569 break;
2570 }
2571
2572 requestPtr.p->m_state = Request::RS_RUNNING;
2573 ndbrequire(nodePtr.p->m_info != 0 && nodePtr.p->m_info->m_start != 0);
2574 (this->*(nodePtr.p->m_info->m_start))(signal, requestPtr, nodePtr);
2575 }
2576 while (0);
2577
2578 //Possibly completed (or failed) entire request.
2579 checkBatchComplete(signal, requestPtr);
2580 }
2581
2582 /**
2583 * Check if all outstanding work for 'Request' has completed.
2584 *
2585 * All ::exec<FOO> methods handling replies related to query
2586 * execution, *must* call ::checkBatchComplete() before returning.
2587 */
2588 void
checkBatchComplete(Signal * signal,Ptr<Request> requestPtr)2589 Dbspj::checkBatchComplete(Signal* signal, Ptr<Request> requestPtr)
2590 {
2591 if (unlikely(requestPtr.p->m_outstanding == 0))
2592 {
2593 jam();
2594 batchComplete(signal, requestPtr);
2595 }
2596 }
2597
2598 /**
2599 * Request has completed all outstanding work.
2600 * Signal API about completion status and cleanup
2601 * resources if appropriate.
2602 *
2603 * NOTE: A Request might ::batchComplete() twice if
2604 * a completion phase is required. It will then be called
2605 * the last time from ::complete()
2606 */
2607 void
batchComplete(Signal * signal,Ptr<Request> requestPtr)2608 Dbspj::batchComplete(Signal* signal, Ptr<Request> requestPtr)
2609 {
2610 ndbrequire(requestPtr.p->m_outstanding == 0); // "definition" of batchComplete
2611
2612 bool is_complete = requestPtr.p->m_cnt_active == 0;
2613 bool need_complete_phase = requestPtr.p->m_bits & Request::RT_NEED_COMPLETE;
2614
2615 if (requestPtr.p->isLookup())
2616 {
2617 ndbassert(requestPtr.p->m_cnt_active == 0);
2618 }
2619
2620 if (!is_complete || (is_complete && need_complete_phase == false))
2621 {
2622 /**
2623 * one batch complete, and either
2624 * - request not complete
2625 * - or not complete_phase needed
2626 */
2627 jam();
2628
2629 if ((requestPtr.p->m_state & Request::RS_ABORTING) != 0)
2630 {
2631 ndbassert(is_complete);
2632 }
2633
2634 prepareNextBatch(signal, requestPtr);
2635 sendConf(signal, requestPtr, is_complete);
2636 }
2637 else if (is_complete && need_complete_phase)
2638 {
2639 jam();
2640 /**
2641 * run complete-phase
2642 */
2643 complete(signal, requestPtr);
2644 return;
2645 }
2646
2647 if (requestPtr.p->m_cnt_active == 0)
2648 {
2649 jam();
2650 /**
2651 * Entire Request completed
2652 */
2653 cleanup(requestPtr);
2654 }
2655 else
2656 {
2657 jam();
2658 /**
2659 * Cleanup the TreeNode branches getting another
2660 * batch of result rows.
2661 */
2662 cleanupBatch(requestPtr);
2663 }
2664 }
2665
2666 /**
2667 * Locate next TreeNode(s) to retrieve more rows from.
2668 *
2669 * Calculate set of the 'm_active_tree_nodes' we will receive from in NEXTREQ.
2670 * Add these TreeNodes to the cursor list to be iterated.
2671 */
void
Dbspj::prepareNextBatch(Signal* signal, Ptr<Request> requestPtr)
{
  // Reset cursor bookkeeping; it is rebuilt below for the next batch.
  requestPtr.p->m_cursor_nodes.init();
  requestPtr.p->m_active_tree_nodes.clear();

  if (requestPtr.p->m_cnt_active == 0)
  {
    // Nothing more to fetch - entire request already completed.
    jam();
    return;
  }

  DEBUG("prepareNextBatch, request: " << requestPtr.i);

  if (requestPtr.p->m_bits & Request::RT_REPEAT_SCAN_RESULT)
  {
    /**
     * If REPEAT_SCAN_RESULT we handle bushy scans by returning more *new* rows
     * from only one of the active child scans. If there are multiple
     * bushy scans not being able to return their current result set in
     * a single batch, result sets from the other child scans are repeated
     * until all rows have been returned to the API client.
     *
     * Hence, the cross joined results from the bushy scans are partly
     * produced within the SPJ block on a 'batchsize granularity',
     * and partly it is the responsibility of the API-client by iterating
     * the result rows within the current result batches.
     * (Opposed to non-REPEAT_SCAN_RESULT, the client only has to care about
     *  the current batched rows - no buffering is required)
     */
    jam();
    Ptr<TreeNode> nodePtr;
    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);

    /**
     * Locate last 'TN_ACTIVE' TreeNode which is the only one chosen
     * to return more *new* rows.
     */
    for (list.last(nodePtr); !nodePtr.isNull(); list.prev(nodePtr))
    {
      if (nodePtr.p->m_state == TreeNode::TN_ACTIVE)
      {
        jam();
        DEBUG("Will fetch more from 'active' m_node_no: " << nodePtr.p->m_node_no);
        /**
         * A later NEXTREQ will request a *new* batch of rows from this TreeNode.
         */
        registerActiveCursor(requestPtr, nodePtr);
        break;
      }
    }

    /**
     * Restart/repeat other (fragment scan) child batches which:
     *   - Being 'after' nodePtr located above.
     *   - Not being an ancestor of (depends on) any 'active' TreeNode.
     *     (As these scans are started when rows from these parent nodes
     *     arrives.)
     */
    if (!nodePtr.isNull())
    {
      jam();
      DEBUG("Calculate 'active', w/ cursor on m_node_no: " << nodePtr.p->m_node_no);

      /* Restart any partial fragment-scans after this 'TN_ACTIVE' TreeNode */
      for (list.next(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
      {
        jam();
        if (!nodePtr.p->m_predecessors.overlaps (requestPtr.p->m_active_tree_nodes))
        {
          jam();
          ndbrequire(nodePtr.p->m_state != TreeNode::TN_ACTIVE);
          ndbrequire(nodePtr.p->m_info != 0);
          if (nodePtr.p->m_info->m_parent_batch_repeat != 0)
          {
            jam();
            (this->*(nodePtr.p->m_info->m_parent_batch_repeat))(signal,
                                                                requestPtr,
                                                                nodePtr);
          }
        }
        /**
         * Adapt to SPJ-API protocol legacy:
         *   API always assumed that any node having an 'active' node as
         *   ancestor gets a new batch of result rows. So we didn't explicitly
         *   set the 'active' bit for these siblings, as it was implicit.
         *   In addition, we might now have (INNER-join) dependencies outside
         *   of the set of ancestor nodes. If such a dependent node, not being one
         *   of our ancestors, is 'active' it will also re-activate this TreeNode.
         *   Have to inform the API about that.
         */
        else if (!nodePtr.p->m_ancestors.overlaps (requestPtr.p->m_active_tree_nodes))
        {
          requestPtr.p->m_active_tree_nodes.set(nodePtr.p->m_node_no);
        }
      }
    } // if (!nodePtr.isNull()
  }
  else // not 'RT_REPEAT_SCAN_RESULT'
  {
    /**
     * If not REPEAT_SCAN_RESULT multiple active TreeNodes may return their
     * remaining result simultaneously. In case of bushy-scans, these
     * concurrent result streams are cross joins of each other
     * in SQL terms. In order to produce the cross joined result, it is
     * the responsibility of the API-client to buffer these streams and
     * iterate them to produce the cross join.
     */
    jam();
    Ptr<TreeNode> nodePtr;
    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
    TreeNodeBitMask predecessors_of_active;

    // Iterate backwards so predecessors of already-registered cursors
    // can be excluded below.
    for (list.last(nodePtr); !nodePtr.isNull(); list.prev(nodePtr))
    {
      /**
       * If we are active (i.e not consumed all rows originating
       * from parent rows) and we are not in the set of parents
       * for any active child:
       *
       * Then, this is a position that execSCAN_NEXTREQ should continue
       */
      if (nodePtr.p->m_state == TreeNode::TN_ACTIVE &&
          !predecessors_of_active.get(nodePtr.p->m_node_no))
      {
        jam();
        DEBUG("Add 'active' m_node_no: " << nodePtr.p->m_node_no);
        registerActiveCursor(requestPtr, nodePtr);
        predecessors_of_active.bitOR(nodePtr.p->m_predecessors);
      }
    }
  } // if (RT_REPEAT_SCAN_RESULT)

  DEBUG("Calculated 'm_active_tree_nodes': " << requestPtr.p->m_active_tree_nodes.rep.data[0]);
}
2807
2808 void
registerActiveCursor(Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)2809 Dbspj::registerActiveCursor(Ptr<Request> requestPtr, Ptr<TreeNode> treeNodePtr)
2810 {
2811 Uint32 bit = treeNodePtr.p->m_node_no;
2812 ndbrequire(!requestPtr.p->m_active_tree_nodes.get(bit));
2813 requestPtr.p->m_active_tree_nodes.set(bit);
2814
2815 Local_TreeNodeCursor_list list(m_treenode_pool, requestPtr.p->m_cursor_nodes);
2816 #ifdef VM_TRACE
2817 {
2818 Ptr<TreeNode> nodePtr;
2819 for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
2820 {
2821 ndbrequire(nodePtr.i != treeNodePtr.i);
2822 }
2823 }
2824 #endif
2825 list.addFirst(treeNodePtr);
2826 }
2827
void
Dbspj::sendConf(Signal* signal, Ptr<Request> requestPtr, bool is_complete)
{
  // Scan requests reply with SCAN_FRAGCONF/REF, lookups with TCKEYREF (errors only;
  // lookup results/CONFs are produced elsewhere).
  if (requestPtr.p->isScan())
  {
    if (unlikely((requestPtr.p->m_state & Request::RS_WAITING) != 0))
    {
      jam();
      /**
       * We aborted request ourselves (due to node-failure ?)
       * but TC haven't contacted us...so we can't reply yet...
       */
      ndbrequire(is_complete);
      ndbrequire((requestPtr.p->m_state & Request::RS_ABORTING) != 0);
      return;
    }

    if (requestPtr.p->m_errCode == 0)
    {
      jam();
      ScanFragConf * conf=
        reinterpret_cast<ScanFragConf*>(signal->getDataPtrSend());
      conf->senderData = requestPtr.p->m_senderData;
      conf->transId1 = requestPtr.p->m_transId[0];
      conf->transId2 = requestPtr.p->m_transId[1];
      conf->completedOps = requestPtr.p->m_rows;
      conf->fragmentCompleted = is_complete ? 1 : 0;
      // NOTE: 'total_len' is used to carry the 'active tree nodes' bitmask here.
      conf->total_len = requestPtr.p->m_active_tree_nodes.rep.data[0];

      /**
       * Collect the map of nodes still having more rows to return.
       * Note that this 'activeMask' is returned as part of the
       * extended format of the ScanFragConf signal introduced in wl7636.
       * If returned to a TC node not yet upgraded, the extended part
       * of the ScanFragConf is simply ignored.
       */
      Uint32 activeMask = 0;
      Ptr<TreeNode> treeNodePtr;
      Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);

      for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
      {
        if (treeNodePtr.p->m_state == TreeNode::TN_ACTIVE)
        {
          // activeMask is a 32-bit word; node numbers must fit.
          DBUG_ASSERT(treeNodePtr.p->m_node_no <= 31);
          activeMask |= (1 << treeNodePtr.p->m_node_no);
        }
      }
      conf->activeMask = activeMask;
      c_Counters.incr_counter(CI_SCAN_BATCHES_RETURNED, 1);
      c_Counters.incr_counter(CI_SCAN_ROWS_RETURNED, requestPtr.p->m_rows);

#ifdef SPJ_TRACE_TIME
      // Per-request timing statistics, only compiled in when tracing.
      const NDB_TICKS now = NdbTick_getCurrentTicks();
      const NDB_TICKS then = requestPtr.p->m_save_time;
      const Uint64 diff = NdbTick_Elapsed(then,now).microSec();

      requestPtr.p->m_sum_rows += requestPtr.p->m_rows;
      requestPtr.p->m_sum_running += Uint32(diff);
      requestPtr.p->m_cnt_batches++;
      requestPtr.p->m_save_time = now;

      if (is_complete)
      {
        Uint32 cnt = requestPtr.p->m_cnt_batches;
        ndbout_c("batches: %u avg_rows: %u avg_running: %u avg_wait: %u",
                 cnt,
                 (requestPtr.p->m_sum_rows / cnt),
                 (requestPtr.p->m_sum_running / cnt),
                 cnt == 1 ? 0 : requestPtr.p->m_sum_waiting / (cnt - 1));
      }
#endif

      /**
       * reset for next batch
       */
      requestPtr.p->m_rows = 0;
      if (!is_complete)
      {
        jam();
        // More batches to come: wait for SCAN_NEXTREQ from TC.
        requestPtr.p->m_state |= Request::RS_WAITING;
      }
#ifdef DEBUG_SCAN_FRAGREQ
      ndbout_c("Dbspj::sendConf() sending SCAN_FRAGCONF ");
      printSCAN_FRAGCONF(stdout, signal->getDataPtrSend(),
                         conf->total_len,
                         DBLQH);
#endif
      sendSignal(requestPtr.p->m_senderRef, GSN_SCAN_FRAGCONF, signal,
                 ScanFragConf::SignalLength_ext, JBB);
    }
    else
    {
      jam();
      // Scan failed: report the error with SCAN_FRAGREF.
      ndbrequire(is_complete);
      ScanFragRef * ref=
        reinterpret_cast<ScanFragRef*>(signal->getDataPtrSend());
      ref->senderData = requestPtr.p->m_senderData;
      ref->transId1 = requestPtr.p->m_transId[0];
      ref->transId2 = requestPtr.p->m_transId[1];
      ref->errorCode = requestPtr.p->m_errCode;

      sendSignal(requestPtr.p->m_senderRef, GSN_SCAN_FRAGREF, signal,
                 ScanFragRef::SignalLength, JBB);
    }
  }
  else
  {
    // Lookup request: only a failure needs an explicit reply from here.
    ndbassert(is_complete);
    if (requestPtr.p->m_errCode)
    {
      jam();
      Uint32 resultRef = getResultRef(requestPtr);
      TcKeyRef* ref = (TcKeyRef*)signal->getDataPtr();
      ref->connectPtr = requestPtr.p->m_senderData;
      ref->transId[0] = requestPtr.p->m_transId[0];
      ref->transId[1] = requestPtr.p->m_transId[1];
      ref->errorCode = requestPtr.p->m_errCode;
      ref->errorData = 0;

      sendTCKEYREF(signal, resultRef, requestPtr.p->m_senderRef);
    }
  }

  if (ERROR_INSERTED(17531))
  {
    /**
     * Takes effect for *next* 'long' SPJ signal which will fail
     * to alloc long mem section. Dbspj::execSIGNAL_DROPPED_REP()
     * will then be called, which is what we intend to test here.
     */
    jam();
    ErrorSignalReceive= DBSPJ;
    ErrorMaxSegmentsToSeize= 1;
  }
}
2964
2965 Uint32
getResultRef(Ptr<Request> requestPtr)2966 Dbspj::getResultRef(Ptr<Request> requestPtr)
2967 {
2968 Ptr<TreeNode> nodePtr;
2969 Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
2970 for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
2971 {
2972 if (nodePtr.p->isLookup())
2973 {
2974 jam();
2975 return nodePtr.p->m_lookup_data.m_api_resultRef;
2976 }
2977 }
2978 ndbabort();
2979 return 0;
2980 }
2981
/**
 * Cleanup resources in preparation for a SCAN_NEXTREQ
 * requesting a new batch of rows.
 *
 * Releases / re-initializes the row buffers and per-batch state of
 * all TreeNodes which will produce (or whose predecessors will
 * produce) new rows in the next batch, and invokes any TreeNode-type
 * specific per-batch cleanup handlers.
 */
void
Dbspj::cleanupBatch(Ptr<Request> requestPtr)
{
  /**
   * Needs to be atleast 1 active otherwise we should have
   * taken the Request cleanup "path" in batchComplete
   */
  ndbassert(requestPtr.p->m_cnt_active >= 1);

  /**
   * Release any buffered rows for the TreeNode branches
   * getting new rows.
   */
  if ((requestPtr.p->m_bits & Request::RT_BUFFERS) != 0)
  {
    if ((requestPtr.p->m_bits & Request::RT_MULTI_SCAN) != 0)
    {
      jam();
      /**
       * A MULTI_SCAN may selectively retrieve rows from only
       * some of the (scan-) branches in the Request.
       * Selectively release from only these branches.
       */
      releaseScanBuffers(requestPtr);
    }
    else
    {
      jam();
      /**
       * if not multiple scans in request, simply release all pages allocated
       * for row buffers (all rows will be released anyway)
       */
      // Root node should be the one and only being active
      ndbassert(requestPtr.p->m_cnt_active == 1);
      ndbassert(requestPtr.p->m_active_tree_nodes.get(0));
      releaseRequestBuffers(requestPtr);
    }
  } //RT_BUFFERS

  Ptr<TreeNode> treeNodePtr;
  Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);

  for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
  {
    /**
     * Re-init row buffer structures for those treeNodes getting more rows
     * in the following NEXTREQ, including all its childs.
     * (The row storage itself was released above.)
     */
    if (requestPtr.p->m_active_tree_nodes.get(treeNodePtr.p->m_node_no) ||
        requestPtr.p->m_active_tree_nodes.overlaps(treeNodePtr.p->m_predecessors))
    {
      jam();
      treeNodePtr.p->m_rows.init();
    }

    /* Clear parents 'm_matched' bit for all buffered rows: */
    if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH)
    {
      // Rows kept buffered across batches must forget matches from
      // treeNodes which will deliver a fresh set of rows.
      RowIterator iter;
      for (first(treeNodePtr.p->m_rows, iter); !iter.isNull(); next(iter))
      {
        jam();
        RowPtr row;
        setupRowPtr(treeNodePtr, row,
                    iter.m_base.m_ref, iter.m_base.m_row_ptr);

        row.m_matched->bitANDC(requestPtr.p->m_active_tree_nodes);
      }
    }

    /**
     * Do further cleanup in treeNodes having predecessors getting more rows.
     * (Which excludes the restarted treeNode itself)
     */
    if (requestPtr.p->m_active_tree_nodes.overlaps(treeNodePtr.p->m_predecessors))
    {
      jam();
      /**
       * Common TreeNode cleanup:
       * Deferred operations will have correlation ids which may refer
       * buffered rows released above. These are allocated in
       * the m_batchArena released below.
       * As an optimization we do not explicitly 'release()' these
       * correlation id's:
       *   - There could easily be some hundreds of them, released
       *     one by one in loop.
       *   - At the innermost level the release() is more or less a NOOP
       *     as Arena allocated memory cant be released for reuse.
       */
      m_arenaAllocator.release(treeNodePtr.p->m_batchArena);
      treeNodePtr.p->m_deferred.init();

      /**
       * TreeNode-type specific cleanup.
       */
      if (treeNodePtr.p->m_info->m_parent_batch_cleanup != 0)
      {
        jam();
        (this->*(treeNodePtr.p->m_info->m_parent_batch_cleanup))(requestPtr,
                                                                 treeNodePtr);
      }
    }
  }
}
3090
3091 void
releaseScanBuffers(Ptr<Request> requestPtr)3092 Dbspj::releaseScanBuffers(Ptr<Request> requestPtr)
3093 {
3094 Ptr<TreeNode> treeNodePtr;
3095 Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3096
3097 for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
3098 {
3099 /**
3100 * Release buffered rows for all treeNodes getting more rows
3101 * in the following NEXTREQ, including all its childs.
3102 */
3103 if (requestPtr.p->m_active_tree_nodes.get(treeNodePtr.p->m_node_no) ||
3104 requestPtr.p->m_active_tree_nodes.overlaps(treeNodePtr.p->m_predecessors))
3105 {
3106 if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY)
3107 {
3108 jam();
3109 releaseNodeRows(requestPtr, treeNodePtr);
3110 }
3111 }
3112 }
3113 }
3114
/**
 * Release all row-buffer storage held by the specified TreeNode:
 * first every buffered row, then (for COLLECTION_MAP collections)
 * the RowMap itself, which was allocated from the same row memory.
 */
void
Dbspj::releaseNodeRows(Ptr<Request> requestPtr, Ptr<TreeNode> treeNodePtr)
{
  /**
   * Release all rows associated with tree node
   */
  DEBUG("releaseNodeRows"
     << ", node: " << treeNodePtr.p->m_node_no
     << ", request: " << requestPtr.i
  );

  ndbassert(treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY);

  Uint32 cnt = 0;
  RowIterator iter;
  for (first(treeNodePtr.p->m_rows, iter); !iter.isNull(); )
  {
    jam();
    // Save the row ref and step the iterator *before* releasing the
    // row, as the release invalidates the current iterator position.
    RowRef pos = iter.m_base.m_ref;
    next(iter);
    releaseRow(treeNodePtr, pos);
    cnt ++;
  }
  DEBUG("RowIterator: released " << cnt << " rows!");

  if (treeNodePtr.p->m_rows.m_type == RowCollection::COLLECTION_MAP)
  {
    jam();
    // Release the (now empty) RowMap
    RowMap& map = treeNodePtr.p->m_rows.m_map;
    if (!map.isNull())
    {
      jam();
      RowRef ref;
      map.copyto(ref);
      releaseRow(treeNodePtr, ref); // Map was allocated in row memory
    }
  }
}
3154
/**
 * Release a single row from the var-sized page it is stored on, and
 * maintain the row buffer's page list / free-space bookkeeping:
 *  - A page which becomes completely empty is returned to the page
 *    pool, unless it is the only page left in the buffer.
 *  - A page which now has more free space than the recorded
 *    'm_var.m_free' is moved last in the FIFO, making its space
 *    available for subsequent allocations.
 */
void
Dbspj::releaseRow(Ptr<TreeNode> treeNodePtr, RowRef pos)
{
  // Only when var-alloc, or else stack will be popped wo/ consideration
  // to individual rows
  const RowCollection& collection = treeNodePtr.p->m_rows;
  ndbassert(collection.m_base.m_rowBuffer != NULL);
  ndbassert(collection.m_base.m_rowBuffer->m_type == BUFFER_VAR);
  ndbassert(pos.m_alloc_type == BUFFER_VAR);

  RowBuffer& rowBuffer = *collection.m_base.m_rowBuffer;
  Ptr<RowPage> ptr;
  m_page_pool.getPtr(ptr, pos.m_page_id);
  ((Var_page*)ptr.p)->free_record(pos.m_page_pos, Var_page::CHAIN);
  Uint32 free_space = ((Var_page*)ptr.p)->free_space;
  if (free_space == Var_page::DATA_WORDS - 1)
  {
    // Page became completely empty
    jam();
    Local_RowPage_fifo list(m_page_pool,
                            rowBuffer.m_page_list);
    const bool last = list.hasNext(ptr) == false;
    list.remove(ptr);
    if (list.isEmpty())
    {
      jam();
      /**
       * Don't remove last page...
       */
      list.addLast(ptr);
      rowBuffer.m_var.m_free = free_space;
    }
    else
    {
      jam();
      if (last)
      {
        jam();
        /**
         * If we were last...set m_var.m_free to free_space of newLastPtr
         */
        Ptr<RowPage> newLastPtr;
        ndbrequire(list.last(newLastPtr));
        rowBuffer.m_var.m_free = ((Var_page*)newLastPtr.p)->free_space;
      }
      releasePage(ptr);
    }
  }
  else if (free_space > rowBuffer.m_var.m_free)
  {
    // This page now has more room than the current tail page:
    // move it last so its free space is used by later allocations.
    jam();
    Local_RowPage_fifo list(m_page_pool,
                            rowBuffer.m_page_list);
    list.remove(ptr);
    list.addLast(ptr);
    rowBuffer.m_var.m_free = free_space;
  }
}
3212
3213 void
releaseRequestBuffers(Ptr<Request> requestPtr)3214 Dbspj::releaseRequestBuffers(Ptr<Request> requestPtr)
3215 {
3216 DEBUG("releaseRequestBuffers"
3217 << ", request: " << requestPtr.i
3218 );
3219 /**
3220 * Release all pages for request
3221 */
3222 {
3223 Local_RowPage_list freelist(m_page_pool, m_free_page_list);
3224 freelist.prependList(requestPtr.p->m_rowBuffer.m_page_list);
3225 }
3226 requestPtr.p->m_rowBuffer.reset();
3227 }
3228
3229 /**
3230 * Handle that batch for this 'TreeNode' is complete.
3231 */
3232 void
handleTreeNodeComplete(Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)3233 Dbspj::handleTreeNodeComplete(Signal * signal, Ptr<Request> requestPtr,
3234 Ptr<TreeNode> treeNodePtr)
3235 {
3236 if ((requestPtr.p->m_state & Request::RS_ABORTING) == 0)
3237 {
3238 jam();
3239 ndbassert(requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));
3240 ndbassert(treeNodePtr.p->m_deferred.isEmpty());
3241
3242 /**
3243 * If all predecessors are complete, this has to be reported
3244 * as we might be waiting for this condition to start more
3245 * operations.
3246 */
3247 if (requestPtr.p->m_completed_tree_nodes.contains(treeNodePtr.p->m_predecessors))
3248 {
3249 jam();
3250 reportAncestorsComplete(signal, requestPtr, treeNodePtr);
3251 }
3252 }
3253 }
3254
/**
 * Notify any TreeNode(s) to be executed after the completed
 * TreeNode that their predecessors has completed their batch.
 *
 * Recurses depth-first through 'm_next_nodes': each next-TreeNode
 * whose full set of predecessors has completed is resumed and/or
 * notified, and then itself reported complete to its successors.
 */
void
Dbspj::reportAncestorsComplete(Signal* signal, Ptr<Request> requestPtr,
                               Ptr<TreeNode> treeNodePtr)
{
  DEBUG("reportAncestorsComplete: " << treeNodePtr.p->m_node_no);

  {
    jam();
    LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
    Local_dependency_map nextExec(pool, treeNodePtr.p->m_next_nodes);
    Dependency_map::ConstDataBufferIterator it;

    for (nextExec.first(it); !it.isNull(); nextExec.next(it))
    {
      jam();
      Ptr<TreeNode> nextTreeNodePtr;
      m_treenode_pool.getPtr(nextTreeNodePtr, *it.data);

      /**
       * Notify all TreeNodes which depends on the completed predecessors.
       */
      if (requestPtr.p->m_completed_tree_nodes.contains(nextTreeNodePtr.p->m_predecessors))
      {
        ndbassert(nextTreeNodePtr.p->m_deferred.isEmpty());

        // Resume a node which buffered its operations while waiting
        // for its predecessors to complete.
        if (nextTreeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_NODE)
        {
          resumeBufferedNode(signal, requestPtr, nextTreeNodePtr);
        }

        /* Notify only TreeNodes which has requested a completion notify. */
        if (nextTreeNodePtr.p->m_bits & TreeNode::T_NEED_REPORT_BATCH_COMPLETED)
        {
          jam();
          ndbassert(nextTreeNodePtr.p->m_info != NULL);
          ndbassert(nextTreeNodePtr.p->m_info->m_parent_batch_complete != NULL);
          (this->*(nextTreeNodePtr.p->m_info->m_parent_batch_complete))(signal,
                                                                        requestPtr,
                                                                        nextTreeNodePtr);
        }
        // Recurse to this node's own successors.
        reportAncestorsComplete(signal, requestPtr, nextTreeNodePtr);
      }
    }
  }
}
3304
3305 /**
3306 * Set the Request to ABORTING state, and where appropriate,
3307 * inform any participating LDMs about the decission to
3308 * terminate the query.
3309 *
3310 * NOTE: No reply is yet sent to the API. This is taken care of by
3311 * the outermost ::exec<FOO> methods calling either ::checkPrepareComplete()
3312 * or ::checkBatchComplete(), which send a CONF/REF reply when all
3313 * 'outstanding' work is done.
3314 */
3315 void
abort(Signal * signal,Ptr<Request> requestPtr,Uint32 errCode)3316 Dbspj::abort(Signal* signal, Ptr<Request> requestPtr, Uint32 errCode)
3317 {
3318 jam();
3319 if ((requestPtr.p->m_state & Request::RS_ABORTING) != 0)
3320 {
3321 jam();
3322 return;
3323 }
3324
3325 requestPtr.p->m_state |= Request::RS_ABORTING;
3326 requestPtr.p->m_errCode = errCode;
3327
3328 {
3329 Ptr<TreeNode> nodePtr;
3330 Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
3331 for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
3332 {
3333 jam();
3334 ndbrequire(nodePtr.p->m_info != 0);
3335 if (nodePtr.p->m_info->m_abort != 0)
3336 {
3337 jam();
3338 (this->*(nodePtr.p->m_info->m_abort))(signal, requestPtr, nodePtr);
3339 }
3340 }
3341 }
3342 }
3343
/**
 * Handle data node failure(s) for one Request: offer every TreeNode
 * its type-specific NODE_FAILREP handler, and abort the Request if
 * it was affected (or, for scans, if the requesting TC itself died).
 *
 * Returns 0 when the Request was unaffected and left running,
 * otherwise 'cnt + iter' (handler hits + handlers invoked).
 */
Uint32
Dbspj::nodeFail(Signal* signal, Ptr<Request> requestPtr,
                NdbNodeBitmask nodes)
{
  Uint32 cnt = 0;   // Number of TreeNodes reporting being affected
  Uint32 iter = 0;  // Number of TreeNodes with a NODE_FAILREP handler

  {
    Ptr<TreeNode> nodePtr;
    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
    for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
    {
      jam();
      ndbrequire(nodePtr.p->m_info != 0);
      if (nodePtr.p->m_info->m_execNODE_FAILREP != 0)
      {
        jam();
        iter ++;
        cnt += (this->*(nodePtr.p->m_info->m_execNODE_FAILREP))(signal,
                                                                requestPtr,
                                                                nodePtr, nodes);
      }
    }
  }

  if (cnt == 0)
  {
    jam();
    /**
     * None of the operations needed NodeFailRep "action"
     * check if our TC has died...but...only needed in
     * scan case...for lookup...not so...
     */
    if (requestPtr.p->isLookup())
    {
      jam();
      return 0; //Lookup: Don't care about TC still alive
    }
    else if (!nodes.get(refToNode(requestPtr.p->m_senderRef)))
    {
      jam();
      return 0; //Scan: Requesting TC is still alive.
    }
  }

  // Request was affected by the failure(s): abort it and
  // handle a possibly now-complete batch.
  jam();
  abort(signal, requestPtr, DbspjErr::NodeFailure);
  checkBatchComplete(signal, requestPtr);

  return cnt + iter;
}
3395
/**
 * Run the complete-phase of the Request: invoke the m_complete
 * handler of every TreeNode flagged T_NEED_COMPLETE, then check
 * whether the batch as a whole can be completed.
 */
void
Dbspj::complete(Signal* signal, Ptr<Request> requestPtr)
{
  /**
   * we need to run complete-phase before sending last SCAN_FRAGCONF
   */
  // Preserve only the ABORTING/WAITING flags while switching
  // the Request into RS_COMPLETING state.
  Uint32 flags = requestPtr.p->m_state &
                 (Request::RS_ABORTING | Request::RS_WAITING);

  requestPtr.p->m_state = Request::RS_COMPLETING | flags;

  // clear bit so that next batchComplete()
  // will continue to cleanup
  ndbassert((requestPtr.p->m_bits & Request::RT_NEED_COMPLETE) != 0);
  requestPtr.p->m_bits &= ~(Uint32)Request::RT_NEED_COMPLETE;
  ndbassert(requestPtr.p->m_outstanding == 0);
  requestPtr.p->m_outstanding = 0;  // Defensive: guarantee invariant in release builds
  {
    Ptr<TreeNode> nodePtr;
    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
    for (list.first(nodePtr); !nodePtr.isNull(); list.next(nodePtr))
    {
      jam();
      if (nodePtr.p->m_bits & TreeNode::T_NEED_COMPLETE)
      {
        jam();
        ndbassert(nodePtr.p->m_info != NULL);
        ndbassert(nodePtr.p->m_info->m_complete != NULL);
        (this->*(nodePtr.p->m_info->m_complete))(signal, requestPtr, nodePtr);
      }
    }
  }

  jam();
  checkBatchComplete(signal, requestPtr);
}
3432
/**
 * Release as much as possible of sub objects owned by this Request,
 * including its TreeNodes.
 * The Request itself is *not* released yet as it may still be needed
 * to track the state of the request. (Set to include RS_DONE)
 */
void
Dbspj::cleanup(Ptr<Request> requestPtr)
{
  ndbrequire(requestPtr.p->m_cnt_active == 0);
  {
    // Release every TreeNode: type-specific cleanup handler first,
    // then the TreeNode object itself.
    Ptr<TreeNode> nodePtr;
    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
    while (list.removeFirst(nodePtr))
    {
      jam();
      ndbrequire(nodePtr.p->m_info != 0 && nodePtr.p->m_info->m_cleanup != 0);
      (this->*(nodePtr.p->m_info->m_cleanup))(requestPtr, nodePtr);

      m_treenode_pool.release(nodePtr);
    }
  }
  if (requestPtr.p->isScan())
  {
    jam();

    /**
     * If a Request in state RS_WAITING is aborted (node failure?),
     * there is no ongoing client request we can reply to.
     * We set it to RS_ABORTED state now, a later SCAN_NEXTREQ will
     * find the RS_ABORTED request, REF with the abort reason, and
     * then complete the cleaning up
     *
     * NOTE1: If no SCAN_NEXTREQ ever arrives for this Request, it
     *        is effectively leaked!
     *
     * NOTE2: During testing I was never able to find any SCAN_NEXTREQ
     *        arriving for a ABORTED query. So there likely are such
     *        leaks! Suspect that TC does not send SCAN_NEXTREQ to
     *        SPJ/LQH blocks affected by a node failure?
     */
    if (unlikely((requestPtr.p->m_state & Request::RS_WAITING) != 0))
    {
      jam();
      // Leave a 'toombstone' behind; do NOT release the Request yet.
      requestPtr.p->m_state = Request::RS_ABORTED;
      return;
    }
    m_scan_request_hash.remove(requestPtr, *requestPtr.p);
  }
  else
  {
    jam();
    m_lookup_request_hash.remove(requestPtr, *requestPtr.p);
  }
  releaseRequestBuffers(requestPtr);
  // Save the arena head before releasing the Request which owns it.
  ArenaHead ah = requestPtr.p->m_arena;
  m_request_pool.release(requestPtr);
  m_arenaAllocator.release(ah);
}
3492
/**
 * TreeNode cleanup common to all node types: release the dependency
 * maps, key/attr patterns, batch arena and any keyInfo/attrInfo
 * sections still attached to the TreeNode.
 */
void
Dbspj::cleanup_common(Ptr<Request> requestPtr, Ptr<TreeNode> treeNodePtr)
{
  jam();

  // Release TreeNode object allocated in the Request 'global' m_arena.
  // (Actualy obsolete by entire Request::m_arena released later)
  LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
  {
    Local_dependency_map list(pool, treeNodePtr.p->m_child_nodes);
    list.release();
  }

  {
    Local_pattern_store pattern(pool, treeNodePtr.p->m_keyPattern);
    pattern.release();
  }

  {
    Local_pattern_store pattern(pool, treeNodePtr.p->m_attrParamPattern);
    pattern.release();
  }

  // Correlation ids for deferred operations are allocated in the batch specific
  // arena. It is sufficient to release entire memory arena.
  m_arenaAllocator.release(treeNodePtr.p->m_batchArena);

  if (treeNodePtr.p->m_send.m_keyInfoPtrI != RNIL)
  {
    jam();
    releaseSection(treeNodePtr.p->m_send.m_keyInfoPtrI);
  }

  if (treeNodePtr.p->m_send.m_attrInfoPtrI != RNIL)
  {
    jam();
    releaseSection(treeNodePtr.p->m_send.m_attrInfoPtrI);
  }
}
3532
/**
 * Helper for the spjCheck() macro below: log a failed consistency
 * check (predicate text, source location and block instance) to the
 * event logger. Always returns false so the macro yields the check
 * result as a boolean.
 */
static
bool
spjCheckFailFunc(const char* predicate,
                 const char* file,
                 const unsigned line,
                 const Uint32 instance)
{
  g_eventLogger->info("DBSPJ %u : Failed spjCheck (%s) "
                      "at line %u of %s.",
                      instance,
                      predicate,
                      line,
                      file);
  return false;
}

/**
 * Evaluate 'check'; on failure log it (once) via spjCheckFailFunc()
 * and evaluate to false. Used by ::checkRequest() to record problems
 * without aborting immediately.
 */
#define spjCheck(check) \
  ((check)?true: \
   spjCheckFailFunc(#check, __FILE__, __LINE__, instance())) \

3553
/**
 * Run consistency checks on a Request by invoking each TreeNode's
 * type-specific m_checkNode handler. On any failure the Request is
 * dumped for diagnostics and the data node is aborted.
 * Returns true when all checks pass (always true on return, as a
 * failure ends in ndbabort()).
 */
bool
Dbspj::checkRequest(const Ptr<Request> requestPtr)
{
  jam();

  /**
   * We check the request, with individual assertions
   * affecting the overall result code
   * We attempt to dump the request if there's a problem
   * Dumping is done last to avoid problems with iterating
   * lists concurrently + IntrusiveList.
   * So checks should record the problem type etc, but not
   * ndbabort() immediately.  See spjCheck() above.
   */

  bool result = true;

  {
    Ptr<TreeNode> treeNodePtr;
    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
    for (list.first(treeNodePtr);
         !treeNodePtr.isNull();
         list.next(treeNodePtr))
    {
      jam();
      ndbrequire(treeNodePtr.p->m_info != NULL);
      if (treeNodePtr.p->m_info->m_checkNode != NULL)
      {
        jam();
        // Accumulate: any failing node check makes the overall result false.
        result &= (this->*(treeNodePtr.p->m_info->m_checkNode))
          (requestPtr, treeNodePtr);
      }
    }
  }

  if (!result)
  {
    // Dump only after all iteration is done (see note above).
    dumpRequest("failed checkRequest()",
                requestPtr);
    ndbabort();
  }

  return result;
}
3598
/**
 * Processing of signals from LQH
 */

/**
 * LQHKEYREF: a key operation sent on behalf of a lookup TreeNode
 * failed in LQH. Locate the TreeNode/Request from ref->connectPtr
 * and dispatch to the TreeNode-type specific handler.
 */
void
Dbspj::execLQHKEYREF(Signal* signal)
{
  jamEntry();

  const LqhKeyRef* ref = reinterpret_cast<const LqhKeyRef*>(signal->getDataPtr());

  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, ref->connectPtr);

  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
  // A REF must not arrive for an already completed TreeNode.
  ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));

  ndbassert(checkRequest(requestPtr));

  DEBUG("execLQHKEYREF"
     << ", node: " << treeNodePtr.p->m_node_no
     << ", request: " << requestPtr.i
     << ", errorCode: " << ref->errorCode
  );

  ndbrequire(treeNodePtr.p->m_info && treeNodePtr.p->m_info->m_execLQHKEYREF);
  (this->*(treeNodePtr.p->m_info->m_execLQHKEYREF))(signal,
                                                    requestPtr,
                                                    treeNodePtr);
  jam();
  // The REF may have been the last outstanding operation of the batch.
  checkBatchComplete(signal, requestPtr);
}
3631
/**
 * LQHKEYCONF: a key operation sent on behalf of a lookup TreeNode
 * completed successfully in LQH. Locate the TreeNode/Request from
 * conf->opPtr and dispatch to the TreeNode-type specific handler.
 */
void
Dbspj::execLQHKEYCONF(Signal* signal)
{
  jamEntry();

  const LqhKeyConf* conf = reinterpret_cast<const LqhKeyConf*>(signal->getDataPtr());
  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, conf->opPtr);

  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
  // A CONF must not arrive for an already completed TreeNode.
  ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));

  DEBUG("execLQHKEYCONF"
     << ", node: " << treeNodePtr.p->m_node_no
     << ", request: " << requestPtr.i
  );

  ndbrequire(treeNodePtr.p->m_info && treeNodePtr.p->m_info->m_execLQHKEYCONF);
  (this->*(treeNodePtr.p->m_info->m_execLQHKEYCONF))(signal,
                                                     requestPtr,
                                                     treeNodePtr);
  jam();
  // The CONF may have been the last outstanding operation of the batch.
  checkBatchComplete(signal, requestPtr);
}
3657
/**
 * SCAN_FRAGREF: a fragment scan started by a scan TreeNode failed.
 * Locate the ScanFragHandle -> TreeNode -> Request chain from
 * ref->senderData and dispatch to the type specific handler.
 */
void
Dbspj::execSCAN_FRAGREF(Signal* signal)
{
  jamEntry();
  const ScanFragRef* ref = reinterpret_cast<const ScanFragRef*>(signal->getDataPtr());

  Ptr<ScanFragHandle> scanFragHandlePtr;
  m_scanfraghandle_pool.getPtr(scanFragHandlePtr, ref->senderData);
  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, scanFragHandlePtr.p->m_treeNodePtrI);
  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
  // A REF must not arrive for an already completed TreeNode.
  ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));

  ndbassert(checkRequest(requestPtr));

  DEBUG("execSCAN_FRAGREF"
     << ", node: " << treeNodePtr.p->m_node_no
     << ", request: " << requestPtr.i
     << ", errorCode: " << ref->errorCode
  );

  ndbrequire(treeNodePtr.p->m_info&&treeNodePtr.p->m_info->m_execSCAN_FRAGREF);
  (this->*(treeNodePtr.p->m_info->m_execSCAN_FRAGREF))(signal,
                                                       requestPtr,
                                                       treeNodePtr,
                                                       scanFragHandlePtr);
  jam();
  // The REF may have been the last outstanding operation of the batch.
  checkBatchComplete(signal, requestPtr);
}
3688
/**
 * SCAN_HBREP: scan heartbeat from a participating LQH/SPJ.
 * Forward it to the TC (or SPJ) which requested this scan, after
 * substituting our senderData with the requester's.
 */
void
Dbspj::execSCAN_HBREP(Signal* signal)
{
  jamEntry();

  Uint32 senderData = signal->theData[0];
  //Uint32 transId[2] = { signal->theData[1], signal->theData[2] };

  Ptr<ScanFragHandle> scanFragHandlePtr;
  m_scanfraghandle_pool.getPtr(scanFragHandlePtr, senderData);
  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, scanFragHandlePtr.p->m_treeNodePtrI);
  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
  DEBUG("execSCAN_HBREP"
     << ", node: " << treeNodePtr.p->m_node_no
     << ", request: " << requestPtr.i
  );

  // Forward: theData[0] is rewritten; theData[1..2] still hold the
  // transId words from the incoming signal (length 3 covers them).
  Uint32 ref = requestPtr.p->m_senderRef;
  signal->theData[0] = requestPtr.p->m_senderData;
  sendSignal(ref, GSN_SCAN_HBREP, signal, 3, JBB);
}
3712
/**
 * SCAN_FRAGCONF: a fragment scan delivered a (partial or final)
 * batch result. Locate the ScanFragHandle -> TreeNode -> Request
 * chain from conf->senderData and dispatch to the type specific
 * handler.
 */
void
Dbspj::execSCAN_FRAGCONF(Signal* signal)
{
  jamEntry();

  const ScanFragConf* conf = reinterpret_cast<const ScanFragConf*>(signal->getDataPtr());

#ifdef DEBUG_SCAN_FRAGREQ
  ndbout_c("Dbspj::execSCAN_FRAGCONF() receiving SCAN_FRAGCONF ");
  printSCAN_FRAGCONF(stdout, signal->getDataPtrSend(),
                     conf->total_len,
                     DBLQH);
#endif

  Ptr<ScanFragHandle> scanFragHandlePtr;
  m_scanfraghandle_pool.getPtr(scanFragHandlePtr, conf->senderData);
  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, scanFragHandlePtr.p->m_treeNodePtrI);
  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);

  ndbassert(checkRequest(requestPtr));

  // A CONF for a completed TreeNode is only acceptable while aborting.
  ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no) ||
            requestPtr.p->m_state & Request::RS_ABORTING);

  DEBUG("execSCAN_FRAGCONF"
     << ", node: " << treeNodePtr.p->m_node_no
     << ", request: " << requestPtr.i
  );

  ndbrequire(treeNodePtr.p->m_info&&treeNodePtr.p->m_info->m_execSCAN_FRAGCONF);
  (this->*(treeNodePtr.p->m_info->m_execSCAN_FRAGCONF))(signal,
                                                        requestPtr,
                                                        treeNodePtr,
                                                        scanFragHandlePtr);
  jam();
  // The CONF may have been the last outstanding operation of the batch.
  checkBatchComplete(signal, requestPtr);
}
3752
/**
 * SCAN_NEXTREQ: the client (TC) either requests the next batch of
 * rows from a waiting scan Request, or requests the scan closed.
 *
 * Handles the special 'toombstone' (RS_ABORTED) and already-aborting
 * states, then scrolls all cursor TreeNodes: the TN_ACTIVE one(s)
 * continue their scan, while inactive ones are restarted when
 * RT_REPEAT_SCAN_RESULT requires repeating their result.
 */
void
Dbspj::execSCAN_NEXTREQ(Signal* signal)
{
  jamEntry();
  const ScanFragNextReq * req = (ScanFragNextReq*)&signal->theData[0];

#ifdef DEBUG_SCAN_FRAGREQ
  DEBUG("Incomming SCAN_NEXTREQ");
  printSCANFRAGNEXTREQ(stdout, &signal->theData[0],
                       ScanFragNextReq::SignalLength, DBLQH);
#endif

  // The Request is identified by (transId, senderData) in the scan hash.
  Request key;
  key.m_transId[0] = req->transId1;
  key.m_transId[1] = req->transId2;
  key.m_senderData = req->senderData;

  Ptr<Request> requestPtr;
  if (unlikely(!m_scan_request_hash.find(requestPtr, key)))
  {
    jam();
    // Unknown Request: only acceptable for a close request
    // (the Request may already have been cleaned up).
    ndbrequire(ScanFragNextReq::getCloseFlag(req->requestInfo));
    return;
  }
  DEBUG("execSCAN_NEXTREQ, request: " << requestPtr.i);

#ifdef SPJ_TRACE_TIME
  const NDB_TICKS now = NdbTick_getCurrentTicks();
  const NDB_TICKS then = requestPtr.p->m_save_time;
  const Uint64 diff = NdbTick_Elapsed(then,now).microSec();
  requestPtr.p->m_sum_waiting += Uint32(diff);
  requestPtr.p->m_save_time = now;
#endif

  ndbassert(checkRequest(requestPtr));

  // We are no longer waiting for the client.
  Uint32 state = requestPtr.p->m_state;
  requestPtr.p->m_state = state & ~Uint32(Request::RS_WAITING);

  do //Not a loop, allows 'break' to common exit/error handling.
  {
    /**
     * A RS_ABORTED query is a 'toombstone' left behind when a
     * RS_WAITING query was aborted by node failues. The idea is
     * that the next SCAN_NEXTREQ will reply with the abort reason
     * and clean up.
     *
     * TODO: This doesn't seems to happen as assumed by design,
     * Thus, RS_ABORTED queries are likely leaked!
     */
    if (unlikely(state == Request::RS_ABORTED))
    {
      jam();
      break;
    }
    if (unlikely((state & Request::RS_ABORTING) != 0))
    {
      /**
       * abort is already in progress...
       * since RS_WAITING is cleared...it will end this request
       */
      jam();
      break;
    }
    if (ScanFragNextReq::getCloseFlag(req->requestInfo)) // Requested close scan
    {
      jam();
      abort(signal, requestPtr, 0); //Stop query, no error
      break;
    }

    ndbrequire((state & Request::RS_WAITING) != 0);
    ndbrequire(requestPtr.p->m_outstanding == 0);

    /**
     * Scroll all relevant cursors...
     */
    Ptr<TreeNode> treeNodePtr;
    Local_TreeNodeCursor_list list(m_treenode_pool,
                                   requestPtr.p->m_cursor_nodes);
    Uint32 cnt_active = 0;

    for (list.first(treeNodePtr); !treeNodePtr.isNull(); list.next(treeNodePtr))
    {
      if (treeNodePtr.p->m_state == TreeNode::TN_ACTIVE)
      {
        jam();
        DEBUG("SCAN_NEXTREQ on TreeNode: "
           << ", m_node_no: " << treeNodePtr.p->m_node_no
           << ", w/ m_parentPtrI: " << treeNodePtr.p->m_parentPtrI);

        ndbrequire(treeNodePtr.p->m_info != 0 &&
                   treeNodePtr.p->m_info->m_execSCAN_NEXTREQ != 0);
        (this->*(treeNodePtr.p->m_info->m_execSCAN_NEXTREQ))(signal,
                                                             requestPtr,
                                                             treeNodePtr);
        cnt_active++;
      }
      else
      {
        /**
         * Restart any other scans not being 'TN_ACTIVE'
         * (Only effective if 'RT_REPEAT_SCAN_RESULT')
         */
        jam();
        ndbrequire(requestPtr.p->m_bits & Request::RT_REPEAT_SCAN_RESULT);
        DEBUG("Restart TreeNode "
           << ", m_node_no: " << treeNodePtr.p->m_node_no
           << ", w/ m_parentPtrI: " << treeNodePtr.p->m_parentPtrI);

        ndbrequire(treeNodePtr.p->m_info != 0 &&
                   treeNodePtr.p->m_info->m_parent_batch_complete !=0 );
        (this->*(treeNodePtr.p->m_info->m_parent_batch_complete))(signal,
                                                                  requestPtr,
                                                                  treeNodePtr);
      }
      // A handler above may have started an abort; stop scrolling then.
      if (unlikely((requestPtr.p->m_state & Request::RS_ABORTING) != 0))
      {
        jam();
        break;
      }
    }// for all treeNodes in 'm_cursor_nodes'

    /* Expected only a single ACTIVE TreeNode among the cursors */
    ndbrequire(cnt_active == 1 ||
               !(requestPtr.p->m_bits & Request::RT_REPEAT_SCAN_RESULT));
  }
  while (0);

  // If nothing restarted, or failed, we have to handle completion
  jam();
  checkBatchComplete(signal, requestPtr);
}
3886
/**
 * TRANSID_AI: a row (attribute data) arrived for a TreeNode.
 * Builds the row access structures, optionally buffers the row
 * (T_BUFFER_ANY), and passes it on to common_execTRANSID_AI for
 * processing by dependent TreeNodes.
 */
void
Dbspj::execTRANSID_AI(Signal* signal)
{
  jamEntry();
  TransIdAI * req = (TransIdAI *)signal->getDataPtr();
  Uint32 ptrI = req->connectPtr;

  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, ptrI);
  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);

  ndbassert(checkRequest(requestPtr));
  // Rows must not arrive for a completed TreeNode, and only for
  // TreeNodes which declared that they expect TRANSID_AI.
  ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));
  ndbassert(treeNodePtr.p->m_bits & TreeNode::T_EXPECT_TRANSID_AI);

  DEBUG("execTRANSID_AI"
     << ", node: " << treeNodePtr.p->m_node_no
     << ", request: " << requestPtr.i
  );

  ndbrequire(signal->getNoOfSections() != 0);

  // Detach the data section from the signal; released at the end.
  SegmentedSectionPtr dataPtr;
  {
    SectionHandle handle(this, signal);
    handle.getSection(dataPtr, 0);
    handle.clear();
  }

#if defined(DEBUG_LQHKEYREQ) || defined(DEBUG_SCAN_FRAGREQ)
  printf("execTRANSID_AI: ");
  print(dataPtr, stdout);
#endif

  /**
   * Register signal as arrived.
   */
  ndbassert(treeNodePtr.p->m_info&&treeNodePtr.p->m_info->m_countSignal);
  (this->*(treeNodePtr.p->m_info->m_countSignal))(signal,
                                                  requestPtr,
                                                  treeNodePtr, 1);

  /**
   * build easy-access-array for row
   */
  Uint32 tmp[2+MAX_ATTRIBUTES_IN_TABLE];
  RowPtr::Header* header = CAST_PTR(RowPtr::Header, &tmp[0]);

  Uint32 cnt = buildRowHeader(header, dataPtr);
  ndbassert(header->m_len < NDB_ARRAY_SIZE(tmp));

  struct RowPtr row;
  row.m_type = RowPtr::RT_SECTION;
  row.m_matched = NULL;
  row.m_src_node_ptrI = treeNodePtr.i;
  row.m_row_data.m_section.m_header = header;
  row.m_row_data.m_section.m_dataPtr.assign(dataPtr);

  // The correlation id is carried in the last attribute of the row.
  getCorrelationData(row.m_row_data.m_section,
                     cnt - 1,
                     row.m_src_correlation);

  do //Dummy loop to allow 'break' into error handling
  {
    if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY)
    {
      jam();
      Uint32 err;

      DEBUG("Need to storeRow"
         << ", node: " << treeNodePtr.p->m_node_no
      );

      // Error-insert cases simulating row buffer allocation failure.
      if (ERROR_INSERTED(17120) ||
         (ERROR_INSERTED(17121) && treeNodePtr.p->m_parentPtrI != RNIL) ||
         (ERROR_INSERTED(17122) && refToNode(signal->getSendersBlockRef()) != getOwnNodeId()))
      {
        jam();
        CLEAR_ERROR_INSERT_VALUE;
        abort(signal, requestPtr, DbspjErr::OutOfRowMemory);
        break;
      }
      else if ((err = storeRow(treeNodePtr, row)) != 0)
      {
        jam();
        abort(signal, requestPtr, err);
        break;
      }
    }
    common_execTRANSID_AI(signal, requestPtr, treeNodePtr, row);
  }
  while(0);

  release(dataPtr);

  /**
   * When TreeNode is completed we might have to reply, or
   * resume other parts of the request.
   */
  if (requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no))
  {
    jam();
    handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
  }

  jam();
  checkBatchComplete(signal, requestPtr);
}
3996
/**
 * storeRow() - Buffer a result row in the TreeNode's RowCollection.
 *
 * Depending on TreeNode::T_BUFFER_ROW either the entire row, or only its
 * correlationId, is buffered. The buffered entry has the layout:
 *
 *   [rowOffset() area][optional MATCH-bitmask][Header][row data / corrId]
 *
 * The rowOffset() area holds the list-link when the collection is a
 * COLLECTION_LIST (null-terminated below via NullRowRef).
 *
 * @param treeNodePtr  TreeNode owning the RowCollection to store into.
 * @param row          The received row, must be of RT_SECTION type.
 * @return 0 on success, else a DbspjErr error code (e.g. OutOfRowMemory).
 */
Uint32
Dbspj::storeRow(Ptr<TreeNode> treeNodePtr, const RowPtr &row)
{
  ndbassert(row.m_type == RowPtr::RT_SECTION);
  RowCollection& collection = treeNodePtr.p->m_rows;
  SegmentedSectionPtr dataPtr = row.m_row_data.m_section.m_dataPtr;
  Uint32 datalen;
  Uint32 *headptr;
  Uint32 headlen;

  Uint32 tmpHeader[2];
  if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ROW)
  {
    // Buffer the full row: reuse the header already built for it.
    headptr = (Uint32*)row.m_row_data.m_section.m_header;
    headlen = 1 + row.m_row_data.m_section.m_header->m_len;
    datalen = dataPtr.sz;
  }
  else
  {
    // Build a header for only the 1-word correlation
    RowPtr::Header *header = CAST_PTR(RowPtr::Header, &tmpHeader[0]);
    header->m_len = 1;
    header->m_offset[0] = 0;
    headptr = (Uint32*)header;
    headlen = 1 + header->m_len;

    // 2 words: AttributeHeader + CorrelationId
    datalen = 2;
  }

  /**
   * Rows might be stored at an offset within the collection.
   * Calculate size to allocate for buffer.
   */
  const Uint32 offset = collection.rowOffset();
  const Uint32 matchlen =
    (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH) ? 1 : 0;
  const Uint32 totlen = offset + matchlen + headlen + datalen;

  RowRef ref;
  Uint32* dstptr = rowAlloc(*collection.m_base.m_rowBuffer, ref, totlen);
  if (unlikely(dstptr == NULL))
  {
    jam();
    return DbspjErr::OutOfRowMemory;
  }
  Uint32 * const saved_dstptr = dstptr;
  dstptr += offset;

  // Insert 'MATCH', Header and 'ROW'/correlationId as specified
  if (matchlen > 0)
  {
    // Seed the match-bitmask with this node and its dependencies.
    TreeNodeBitMask matched(treeNodePtr.p->m_dependencies);
    matched.set(treeNodePtr.p->m_node_no);
    memcpy(dstptr, &matched, 4 * matchlen);
    dstptr += matchlen;
  }

  memcpy(dstptr, headptr, 4 * headlen);
  dstptr += headlen;

  if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ROW)
  {
    //Store entire row, include correlationId (last column)
    copy(dstptr, dataPtr);
  }
  else
  {
    //Store only the correlation-id if not 'BUFFER_ROW':
    // The last offset in the header locates the correlation column.
    const RowPtr::Header *header = row.m_row_data.m_section.m_header;
    const Uint32 pos = header->m_offset[header->m_len-1];
    SectionReader reader(dataPtr, getSectionSegmentPool());
    ndbrequire(reader.step(pos));
    ndbrequire(reader.getWords(dstptr, 2));
  }

  /**
   * Register row in a list or a correlationId searchable 'map'
   * Note that add_to_xxx may relocate entire memory area which
   * 'dstptr' referred, so it is not safe to use 'dstptr' *after*
   * the add_to_* below.
   */
  if (collection.m_type == RowCollection::COLLECTION_LIST)
  {
    NullRowRef.copyto_link(saved_dstptr); // Null terminate list...
    add_to_list(collection.m_list, ref);
  }
  else
  {
    Uint32 error = add_to_map(collection.m_map, row.m_src_correlation, ref);
    if (unlikely(error))
      return error;
  }

  return 0;
}
4093
4094 void
setupRowPtr(Ptr<TreeNode> treeNodePtr,RowPtr & row,RowRef ref,const Uint32 * src)4095 Dbspj::setupRowPtr(Ptr<TreeNode> treeNodePtr,
4096 RowPtr& row, RowRef ref, const Uint32 * src)
4097 {
4098 ndbassert(src != NULL);
4099 const Uint32 offset = treeNodePtr.p->m_rows.rowOffset();
4100 const Uint32 matchlen =
4101 (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH) ? 1 : 0;
4102 const RowPtr::Header * headptr = (RowPtr::Header*)(src + offset + matchlen);
4103 const Uint32 headlen = 1 + headptr->m_len;
4104
4105 // Setup row, containing either entire row or only the correlationId.
4106 row.m_type = RowPtr::RT_LINEAR;
4107 row.m_row_data.m_linear.m_row_ref = ref;
4108 row.m_row_data.m_linear.m_header = headptr;
4109 row.m_row_data.m_linear.m_data = (Uint32*)headptr + headlen;
4110
4111 if (treeNodePtr.p->m_bits & TreeNode::T_BUFFER_MATCH)
4112 {
4113 row.m_matched = (TreeNodeBitMask*)(src + offset);
4114 }
4115 else
4116 {
4117 row.m_matched = NULL;
4118 }
4119 }
4120
4121 void
add_to_list(SLFifoRowList & list,RowRef rowref)4122 Dbspj::add_to_list(SLFifoRowList & list, RowRef rowref)
4123 {
4124 if (list.isNull())
4125 {
4126 jam();
4127 list.m_first_row_page_id = rowref.m_page_id;
4128 list.m_first_row_page_pos = rowref.m_page_pos;
4129 }
4130 else
4131 {
4132 jam();
4133 /**
4134 * add last to list
4135 */
4136 RowRef last;
4137 last.m_alloc_type = rowref.m_alloc_type;
4138 last.m_page_id = list.m_last_row_page_id;
4139 last.m_page_pos = list.m_last_row_page_pos;
4140 Uint32 * const rowptr = get_row_ptr(last);
4141 rowref.copyto_link(rowptr);
4142 }
4143
4144 list.m_last_row_page_id = rowref.m_page_id;
4145 list.m_last_row_page_pos = rowref.m_page_pos;
4146 }
4147
4148 Uint32 *
get_row_ptr(RowRef pos)4149 Dbspj::get_row_ptr(RowRef pos)
4150 {
4151 Ptr<RowPage> ptr;
4152 m_page_pool.getPtr(ptr, pos.m_page_id);
4153 if (pos.m_alloc_type == BUFFER_STACK) // ::stackAlloc() memory
4154 {
4155 jam();
4156 return ptr.p->m_data + pos.m_page_pos;
4157 }
4158 else // ::varAlloc() memory
4159 {
4160 jam();
4161 ndbassert(pos.m_alloc_type == BUFFER_VAR);
4162 return ((Var_page*)ptr.p)->get_ptr(pos.m_page_pos);
4163 }
4164 }
4165
4166 inline
4167 bool
first(const SLFifoRowList & list,SLFifoRowListIterator & iter)4168 Dbspj::first(const SLFifoRowList& list,
4169 SLFifoRowListIterator& iter)
4170 {
4171 if (list.isNull())
4172 {
4173 jam();
4174 iter.setNull();
4175 return false;
4176 }
4177
4178 // const Buffer_type allocator = list.m_rowBuffer->m_type;
4179 iter.m_ref.m_alloc_type = list.m_rowBuffer->m_type;
4180 iter.m_ref.m_page_id = list.m_first_row_page_id;
4181 iter.m_ref.m_page_pos = list.m_first_row_page_pos;
4182 iter.m_row_ptr = get_row_ptr(iter.m_ref);
4183 return true;
4184 }
4185
4186 inline
4187 bool
next(SLFifoRowListIterator & iter)4188 Dbspj::next(SLFifoRowListIterator& iter)
4189 {
4190 iter.m_ref.assign_from_link(iter.m_row_ptr);
4191 if (iter.m_ref.isNull())
4192 {
4193 jam();
4194 return false;
4195 }
4196 iter.m_row_ptr = get_row_ptr(iter.m_ref);
4197 return true;
4198 }
4199
4200 Uint32
add_to_map(RowMap & map,Uint32 corrVal,RowRef rowref)4201 Dbspj::add_to_map(RowMap& map,
4202 Uint32 corrVal, RowRef rowref)
4203 {
4204 Uint32 * mapptr;
4205 if (unlikely(map.isNull()))
4206 {
4207 jam();
4208 ndbassert(map.m_size > 0);
4209 ndbassert(map.m_rowBuffer != NULL);
4210
4211 Uint32 sz16 = RowMap::MAP_SIZE_PER_REF_16 * map.m_size;
4212 Uint32 sz32 = (sz16 + 1) / 2;
4213 RowRef ref;
4214 mapptr = rowAlloc(*map.m_rowBuffer, ref, sz32);
4215 if (unlikely(mapptr == 0))
4216 {
4217 jam();
4218 return DbspjErr::OutOfRowMemory;
4219 }
4220 map.assign(ref);
4221 map.m_elements = 0;
4222 map.clear(mapptr);
4223 }
4224 else
4225 {
4226 jam();
4227 RowRef ref;
4228 map.copyto(ref);
4229 mapptr = get_row_ptr(ref);
4230 }
4231
4232 Uint32 pos = corrVal & 0xFFFF;
4233 ndbrequire(pos < map.m_size);
4234 ndbrequire(map.m_elements < map.m_size);
4235
4236 if (1)
4237 {
4238 /**
4239 * Check that *pos* is empty
4240 */
4241 RowRef check;
4242 map.load(mapptr, pos, check);
4243 ndbrequire(check.m_page_pos == 0xFFFF);
4244 }
4245
4246 map.store(mapptr, pos, rowref);
4247
4248 return 0;
4249 }
4250
4251 inline
4252 bool
first(const RowMap & map,RowMapIterator & iter)4253 Dbspj::first(const RowMap& map,
4254 RowMapIterator & iter)
4255 {
4256 if (map.isNull())
4257 {
4258 jam();
4259 iter.setNull();
4260 return false;
4261 }
4262
4263 iter.m_map_ptr = get_row_ptr(map.m_map_ref);
4264 iter.m_size = map.m_size;
4265 iter.m_ref.m_alloc_type = map.m_rowBuffer->m_type;
4266
4267 Uint32 pos = 0;
4268 while (RowMap::isNull(iter.m_map_ptr, pos) && pos < iter.m_size)
4269 pos++;
4270
4271 if (pos == iter.m_size)
4272 {
4273 jam();
4274 iter.setNull();
4275 return false;
4276 }
4277 else
4278 {
4279 jam();
4280 RowMap::load(iter.m_map_ptr, pos, iter.m_ref);
4281 iter.m_element_no = pos;
4282 iter.m_row_ptr = get_row_ptr(iter.m_ref);
4283 return true;
4284 }
4285 }
4286
4287 inline
4288 bool
next(RowMapIterator & iter)4289 Dbspj::next(RowMapIterator & iter)
4290 {
4291 Uint32 pos = iter.m_element_no + 1;
4292 while (RowMap::isNull(iter.m_map_ptr, pos) && pos < iter.m_size)
4293 pos++;
4294
4295 if (pos == iter.m_size)
4296 {
4297 jam();
4298 iter.setNull();
4299 return false;
4300 }
4301 else
4302 {
4303 jam();
4304 RowMap::load(iter.m_map_ptr, pos, iter.m_ref);
4305 iter.m_element_no = pos;
4306 iter.m_row_ptr = get_row_ptr(iter.m_ref);
4307 return true;
4308 }
4309 }
4310
4311 bool
first(const RowCollection & collection,RowIterator & iter)4312 Dbspj::first(const RowCollection& collection,
4313 RowIterator& iter)
4314 {
4315 iter.m_type = collection.m_type;
4316 if (iter.m_type == RowCollection::COLLECTION_LIST)
4317 {
4318 jam();
4319 return first(collection.m_list, iter.m_list);
4320 }
4321 else
4322 {
4323 jam();
4324 ndbassert(iter.m_type == RowCollection::COLLECTION_MAP);
4325 return first(collection.m_map, iter.m_map);
4326 }
4327 }
4328
4329 bool
next(RowIterator & iter)4330 Dbspj::next(RowIterator& iter)
4331 {
4332 if (iter.m_type == RowCollection::COLLECTION_LIST)
4333 {
4334 jam();
4335 return next(iter.m_list);
4336 }
4337 else
4338 {
4339 jam();
4340 ndbassert(iter.m_type == RowCollection::COLLECTION_MAP);
4341 return next(iter.m_map);
4342 }
4343 }
4344
4345 inline
4346 Uint32 *
stackAlloc(RowBuffer & buffer,RowRef & dst,Uint32 sz)4347 Dbspj::stackAlloc(RowBuffer & buffer, RowRef& dst, Uint32 sz)
4348 {
4349 Ptr<RowPage> ptr;
4350 Local_RowPage_fifo list(m_page_pool, buffer.m_page_list);
4351
4352 Uint32 pos = buffer.m_stack.m_pos;
4353 const Uint32 SIZE = RowPage::SIZE;
4354 if (list.isEmpty() || (pos + sz) > SIZE)
4355 {
4356 jam();
4357 bool ret = allocPage(ptr);
4358 if (unlikely(ret == false))
4359 {
4360 jam();
4361 return 0;
4362 }
4363
4364 pos = 0;
4365 list.addLast(ptr);
4366 }
4367 else
4368 {
4369 list.last(ptr);
4370 }
4371
4372 dst.m_page_id = ptr.i;
4373 dst.m_page_pos = pos;
4374 dst.m_alloc_type = BUFFER_STACK;
4375 buffer.m_stack.m_pos = pos + sz;
4376 return ptr.p->m_data + pos;
4377 }
4378
4379 inline
4380 Uint32 *
varAlloc(RowBuffer & buffer,RowRef & dst,Uint32 sz)4381 Dbspj::varAlloc(RowBuffer & buffer, RowRef& dst, Uint32 sz)
4382 {
4383 Ptr<RowPage> ptr;
4384 Local_RowPage_fifo list(m_page_pool, buffer.m_page_list);
4385
4386 Uint32 free_space = buffer.m_var.m_free;
4387 if (list.isEmpty() || free_space < (sz + 1))
4388 {
4389 jam();
4390 bool ret = allocPage(ptr);
4391 if (unlikely(ret == false))
4392 {
4393 jam();
4394 return 0;
4395 }
4396
4397 list.addLast(ptr);
4398 ((Var_page*)ptr.p)->init();
4399 }
4400 else
4401 {
4402 jam();
4403 list.last(ptr);
4404 }
4405
4406 Var_page * vp = (Var_page*)ptr.p;
4407 Uint32 pos = vp->alloc_record(sz, (Var_page*)m_buffer0, Var_page::CHAIN);
4408
4409 dst.m_page_id = ptr.i;
4410 dst.m_page_pos = pos;
4411 dst.m_alloc_type = BUFFER_VAR;
4412 buffer.m_var.m_free = vp->free_space;
4413 return vp->get_ptr(pos);
4414 }
4415
4416 Uint32 *
rowAlloc(RowBuffer & rowBuffer,RowRef & dst,Uint32 sz)4417 Dbspj::rowAlloc(RowBuffer& rowBuffer, RowRef& dst, Uint32 sz)
4418 {
4419 if (rowBuffer.m_type == BUFFER_STACK)
4420 {
4421 jam();
4422 return stackAlloc(rowBuffer, dst, sz);
4423 }
4424 else if (rowBuffer.m_type == BUFFER_VAR)
4425 {
4426 jam();
4427 return varAlloc(rowBuffer, dst, sz);
4428 }
4429 else
4430 {
4431 jam();
4432 ndbabort();
4433 return NULL;
4434 }
4435 }
4436
4437 bool
allocPage(Ptr<RowPage> & ptr)4438 Dbspj::allocPage(Ptr<RowPage> & ptr)
4439 {
4440 if (m_free_page_list.isEmpty())
4441 {
4442 jam();
4443 if (ERROR_INSERTED_CLEAR(17003))
4444 {
4445 jam();
4446 ndbout_c("Injecting failed '::allocPage', error 17003 at line %d file %s",
4447 __LINE__, __FILE__);
4448 return false;
4449 }
4450 ptr.p = (RowPage*)m_ctx.m_mm.alloc_page(RT_SPJ_DATABUFFER,
4451 &ptr.i,
4452 Ndbd_mem_manager::NDB_ZONE_LE_32);
4453 if (ptr.p == 0)
4454 {
4455 jam();
4456 return false;
4457 }
4458 return true;
4459 }
4460 else
4461 {
4462 jam();
4463 Local_RowPage_list list(m_page_pool, m_free_page_list);
4464 bool ret = list.removeFirst(ptr);
4465 ndbrequire(ret);
4466 return ret;
4467 }
4468 }
4469
4470 void
releasePage(Ptr<RowPage> ptr)4471 Dbspj::releasePage(Ptr<RowPage> ptr)
4472 {
4473 Local_RowPage_list list(m_page_pool, m_free_page_list);
4474 list.addFirst(ptr);
4475 }
4476
4477 void
releaseGlobal(Signal * signal)4478 Dbspj::releaseGlobal(Signal * signal)
4479 {
4480 Uint32 delay = 100;
4481 Local_RowPage_list list(m_page_pool, m_free_page_list);
4482 if (list.isEmpty())
4483 {
4484 jam();
4485 delay = 300;
4486 }
4487 else
4488 {
4489 Ptr<RowPage> ptr;
4490 list.removeFirst(ptr);
4491 m_ctx.m_mm.release_page(RT_SPJ_DATABUFFER, ptr.i);
4492 }
4493
4494 signal->theData[0] = 0;
4495 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, delay, 1);
4496 }
4497
/**
 * checkTableError() - Verify that the table/index referred by this
 * TreeNode still exists and has a schema version compatible with the
 * one captured when the request was built.
 *
 * @return 0 if ok, else a DbspjErr error code (e.g. NoSuchTable).
 */
Uint32
Dbspj::checkTableError(Ptr<TreeNode> treeNodePtr) const
{
  jam();
  // Out-of-range id: table record cannot exist.
  if (treeNodePtr.p->m_tableOrIndexId >= c_tabrecFilesize)
  {
    jam();
    ndbassert(c_tabrecFilesize > 0);
    return DbspjErr::NoSuchTable;
  }

  TableRecordPtr tablePtr;
  tablePtr.i = treeNodePtr.p->m_tableOrIndexId;
  ptrAss(tablePtr, m_tableRecord);
  Uint32 err = tablePtr.p->checkTableError(treeNodePtr.p->m_schemaVersion);
  if (unlikely(err))
  {
    DEBUG_DICT("Dbsp::checkTableError"
               << ", m_node_no: " << treeNodePtr.p->m_node_no
               << ", tableOrIndexId: " << treeNodePtr.p->m_tableOrIndexId
               << ", error: " << err);
  }
  // Error-inserts: fake a NoSuchTable error — always for 17520, for
  // roughly 1 in 7 calls for 17521. Note: deliberately overrides any
  // 'err' found above.
  if (ERROR_INSERTED(17520) ||
      (ERROR_INSERTED(17521) && (rand() % 7) == 0))
  {
    jam();
    CLEAR_ERROR_INSERT_VALUE;
    ndbout_c("::checkTableError, injecting NoSuchTable error at line %d file %s",
             __LINE__, __FILE__);
    return DbspjErr::NoSuchTable;
  }
  return err;
}
4531
/**
 * dumpScanFragHandle() - Log the state of a single ScanFragHandle
 * to the event logger, for debugging purposes (see ::dumpRequest()).
 */
void
Dbspj::dumpScanFragHandle(Ptr<ScanFragHandle> fragPtr) const
{
  jam();

  g_eventLogger->info("DBSPJ %u : SFH fragid %u state %u ref 0x%x "
                      "rangePtr 0x%x",
                      instance(),
                      fragPtr.p->m_fragId,
                      fragPtr.p->m_state,
                      fragPtr.p->m_ref,
                      fragPtr.p->m_rangePtrI);
}
4545
4546
/**
 * dumpNodeCommon() - Log the operation-type-independent state of a
 * TreeNode to the event logger. Operation-specific state is dumped
 * via the m_dumpNode method in the node's OpInfo (see ::dumpRequest()).
 */
void
Dbspj::dumpNodeCommon(const Ptr<TreeNode> treeNodePtr) const
{
  jam();

  g_eventLogger->info("DBSPJ %u : TreeNode (%u) (0x%x:%p) state %u bits 0x%x "
                      "tableid %u schVer 0x%x",
                      instance(),
                      treeNodePtr.p->m_node_no,
                      treeNodePtr.i,
                      treeNodePtr.p,
                      treeNodePtr.p->m_state,
                      treeNodePtr.p->m_bits,
                      treeNodePtr.p->m_tableOrIndexId,
                      treeNodePtr.p->m_schemaVersion);
  g_eventLogger->info("DBSPJ %u : TreeNode (%u) ptableId %u ref 0x%x "
                      "correlation %u parentPtrI 0x%x",
                      instance(),
                      treeNodePtr.p->m_node_no,
                      treeNodePtr.p->m_primaryTableId,
                      treeNodePtr.p->m_send.m_ref,
                      treeNodePtr.p->m_send.m_correlation,
                      treeNodePtr.p->m_parentPtrI);

}
4572
/**
 * dumpRequest() - Log the full state of a Request and all its
 * TreeNodes to the event logger.
 *
 * @param reason      Free-text cause of the dump, included in the log.
 * @param requestPtr  The Request to dump.
 */
void
Dbspj::dumpRequest(const char* reason,
                   const Ptr<Request> requestPtr)
{
  jam();

  /* TODO Add to DUMP_STATE_ORD */

  g_eventLogger->info("DBSPJ %u : Dumping request (0x%x:%p) due to %s.",
                      instance(),
                      requestPtr.i,
                      requestPtr.p,
                      reason);

  g_eventLogger->info("DBSPJ %u : Request state %u bits 0x%x errCode %u "
                      "senderRef 0x%x rootFragId %u",
                      instance(),
                      requestPtr.p->m_state,
                      requestPtr.p->m_bits,
                      requestPtr.p->m_errCode,
                      requestPtr.p->m_senderRef,
                      requestPtr.p->m_rootFragId);

  g_eventLogger->info("DBSPJ %u : Request transid (0x%x 0x%x) node_cnt %u "
                      "active_cnt %u m_outstanding %u",
                      instance(),
                      requestPtr.p->m_transId[0],
                      requestPtr.p->m_transId[1],
                      requestPtr.p->m_node_cnt,
                      requestPtr.p->m_cnt_active,
                      requestPtr.p->m_outstanding);

  /* Iterate over request's nodes */
  {
    Ptr<TreeNode> treeNodePtr;
    Local_TreeNode_list list(m_treenode_pool, requestPtr.p->m_nodes);
    for (list.first(treeNodePtr);
         !treeNodePtr.isNull();
         list.next(treeNodePtr))
    {
      jam();
      ndbrequire(treeNodePtr.p->m_info != NULL);

      // Common state first, then the operation-type specific dump.
      dumpNodeCommon(treeNodePtr);

      if (treeNodePtr.p->m_info->m_dumpNode != NULL)
      {
        jam();
        (this->*(treeNodePtr.p->m_info->m_dumpNode))
          (requestPtr, treeNodePtr);
      }
    }
  }

  g_eventLogger->info("DBSPJ %u : Finished dumping request (%u:%p)",
                      instance(),
                      requestPtr.i,
                      requestPtr.p);
}
4632
/**
 * getBufferedRow() - Locate the row buffered under 'rowId' in this
 * TreeNode's correlationId-searchable row map, and set up '*row'
 * (RT_LINEAR) referring it, including its own correlationId.
 *
 * The TreeNode must buffer its rows (T_BUFFER_ANY) in a COLLECTION_MAP.
 */
void Dbspj::getBufferedRow(const Ptr<TreeNode> treeNodePtr, Uint32 rowId,
                           RowPtr *row)
{
  DEBUG("getBufferedRow, node no: " << treeNodePtr.p->m_node_no
	<< ", rowId: " << rowId);
  ndbassert(treeNodePtr.p->m_bits & TreeNode::T_BUFFER_ANY);

  // Set up RowPtr & RowRef for this parent row
  RowRef ref;
  ndbassert(treeNodePtr.p->m_rows.m_type == RowCollection::COLLECTION_MAP);
  treeNodePtr.p->m_rows.m_map.copyto(ref);
  const Uint32* const mapptr = get_row_ptr(ref);

  // Relocate parent row from correlation value.
  treeNodePtr.p->m_rows.m_map.load(mapptr, rowId, ref);
  const Uint32* const rowptr = get_row_ptr(ref);

  RowPtr _row;
  _row.m_src_node_ptrI = treeNodePtr.i;
  setupRowPtr(treeNodePtr, _row, ref, rowptr);

  // The correlationId is the last column of the buffered row.
  getCorrelationData(_row.m_row_data.m_linear,
                     _row.m_row_data.m_linear.m_header->m_len - 1,
                     _row.m_src_correlation);
  *row = _row;
}
4659
/**
 * resumeBufferedNode() - Resume the execution from the specified TreeNode
 *
 * All preceding nodes which we depend on have completed their
 * batches. The returned result rows from our parent node have
 * been buffered, and the match-bitmaps in our scanAncestor(s)
 * are set up.
 *
 * Iterate through all our buffered parent result rows, check their
 * 'match' vs the dependencies, and submit requests for the
 * qualifying rows.
 */
void
Dbspj::resumeBufferedNode(Signal* signal,
                          Ptr<Request> requestPtr,
                          Ptr<TreeNode> treeNodePtr)
{
  Ptr<TreeNode> parentPtr;
  m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
  ndbassert(treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_NODE);
  ndbassert(parentPtr.p->m_bits & TreeNode::T_BUFFER_ROW);

  int total = 0, skipped = 0;
  RowIterator iter;
  for (first(parentPtr.p->m_rows, iter); !iter.isNull(); next(iter))
  {
    RowPtr parentRow;
    jam();
    total++;

    parentRow.m_src_node_ptrI = treeNodePtr.p->m_parentPtrI;
    setupRowPtr(parentPtr, parentRow,
                iter.m_base.m_ref, iter.m_base.m_row_ptr);

    // The correlationId is the last column of the buffered row.
    getCorrelationData(parentRow.m_row_data.m_linear,
                       parentRow.m_row_data.m_linear.m_header->m_len - 1,
                       parentRow.m_src_correlation);

    // Need to consult the Scan-ancestor(s) to determine if
    // INNER_JOIN matches were found for all of our predecessors
    Ptr<TreeNode> scanAncestorPtr(parentPtr);
    RowPtr scanAncestorRow(parentRow);
    if (treeNodePtr.p->m_parentPtrI != treeNodePtr.p->m_scanAncestorPtrI)
    {
      jam();
      // Parent is not our scan-ancestor: fetch the buffered
      // scan-ancestor row via the upper 16 bits of the correlationId.
      m_treenode_pool.getPtr(scanAncestorPtr, treeNodePtr.p->m_scanAncestorPtrI);
      getBufferedRow(scanAncestorPtr, (parentRow.m_src_correlation >> 16),
                     &scanAncestorRow);
    }

    // Walk the chain of scan-ancestors until all dependencies are
    // proven matched (-> row_accepted) or a mismatch is found (-> break).
    while (true)
    {
      // Only dependencies covered by this ancestor can be checked here.
      TreeNodeBitMask required_matches(treeNodePtr.p->m_dependencies);
      required_matches.bitAND(scanAncestorPtr.p->m_coverage);

      if (!scanAncestorRow.m_matched->contains(required_matches))
      {
        DEBUG("parentRow-join SKIPPED");
        skipped++;
        break;
      }

      if (scanAncestorPtr.p->m_coverage.contains(treeNodePtr.p->m_dependencies))
      {
        jam();
        goto row_accepted;
      }

      // Has to consult grand-ancestors to verify their matches.
      m_treenode_pool.getPtr(scanAncestorPtr, scanAncestorPtr.p->m_scanAncestorPtrI);

      if ((scanAncestorPtr.p->m_bits & TreeNode::T_BUFFER_MATCH) == 0)
      {
        jam();
        goto row_accepted;
      }

      getBufferedRow(scanAncestorPtr, (scanAncestorRow.m_src_correlation >> 16),
                     &scanAncestorRow);
    }
    continue;  //Row skipped, didn't 'match' dependent INNER-join -> next row

row_accepted:
    // Submit the qualifying parent row to this node's operation.
    ndbassert(treeNodePtr.p->m_info != NULL);
    ndbassert(treeNodePtr.p->m_info->m_parent_row != NULL);
    (this->*(treeNodePtr.p->m_info->m_parent_row))(signal, requestPtr, treeNodePtr, parentRow);
  }

  DEBUG("resumeBufferedNode: #buffered rows: " << total << ", skipped: " << skipped);
}
4750
4751 /**
4752 * END - MODULE GENERIC
4753 */
4754
/**
 * common_execTRANSID_AI() - Common processing of a received result row.
 *
 * Unless the request is aborting:
 *  1) For multi-scan requests, propagate this node's 'matched' bit into
 *     the buffered rows of the scan-ancestor chain.
 *  2) Enqueue the row for any 'next' TreeNodes with deferred
 *     (TN_ENQUEUE_OP) execution.
 *  3) Immediately execute all remaining non-delayed 'next' TreeNodes
 *     with this row as their parent row.
 */
void
Dbspj::common_execTRANSID_AI(Signal* signal,
                             Ptr<Request> requestPtr,
                             Ptr<TreeNode> treeNodePtr,
                             const RowPtr & rowRef)
{
  if (likely((requestPtr.p->m_state & Request::RS_ABORTING) == 0))
  {
    // Set 'matched' bit in previous scan ancestors
    if ((requestPtr.p->m_bits & Request::RT_MULTI_SCAN) != 0)
    {
      RowPtr scanAncestorRow(rowRef);
      Uint32 scanAncestorPtrI = treeNodePtr.p->m_scanAncestorPtrI;
      while (scanAncestorPtrI != RNIL)  // or 'break' below
      {
        jam();
        Ptr<TreeNode> scanAncestorPtr;
        m_treenode_pool.getPtr(scanAncestorPtr, scanAncestorPtrI);
        if ((scanAncestorPtr.p->m_bits & TreeNode::T_BUFFER_MATCH) == 0)
        {
          jam();
          break;
        }

        // Upper 16 bits of the correlationId locate the ancestor row.
        getBufferedRow(scanAncestorPtr, (scanAncestorRow.m_src_correlation >> 16),
                       &scanAncestorRow);

        // Already marked as matched: no need to walk further up.
        if (scanAncestorRow.m_matched->get(treeNodePtr.p->m_node_no))
        {
          jam();
          break;
        }
        scanAncestorRow.m_matched->set(treeNodePtr.p->m_node_no);
        scanAncestorPtrI = scanAncestorPtr.p->m_scanAncestorPtrI;
      } //while
    } //RT_MULTI_SCAN

    LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
    Local_dependency_map nextExec(pool, treeNodePtr.p->m_next_nodes);
    Dependency_map::ConstDataBufferIterator it;

    /**
     * Activate 'next' operations in two steps:
     * 1) Any child operations being 'ENQUEUED' are prepared
     *    for later resumed exec by appending rowRefs to the deferred
     *    list.
     * 2) Start immediate executing non-ENQUEUED child operations.
     */
    for (nextExec.first(it); !it.isNull(); nextExec.next(it))
    {
      Ptr<TreeNode> nextTreeNodePtr;
      m_treenode_pool.getPtr(nextTreeNodePtr, * it.data);

      if (nextTreeNodePtr.p->m_resumeEvents & TreeNode::TN_ENQUEUE_OP)
      {
        jam();
        DEBUG("ENQUEUE row for deferred TreeNode: " << nextTreeNodePtr.p->m_node_no);

        /**
         * 'rowRef' is the ancestor row from the immediate ancestor in
         * the execution plan. In case this is different from the parent-treeNode
         * in the 'query', we have to find the 'real' parentRow from the
         * parent as defined in the 'query'
         */
        RowPtr parentRow(rowRef);
        if (nextTreeNodePtr.p->m_parentPtrI != treeNodePtr.i)
        {
          Ptr<TreeNode> parentPtr;
          const Uint32 parentRowId = (parentRow.m_src_correlation >> 16);
          m_treenode_pool.getPtr(parentPtr, nextTreeNodePtr.p->m_parentPtrI);
          getBufferedRow(parentPtr, parentRowId, &parentRow);
        }

        /**
         * Append correlation values of deferred operations
         * to a list / fifo. Upon resume, we will then be able to
         * relocate all BUFFER'ed parent rows for which to resume operations.
         */
        bool appended;
        {
          // Need an own scope for correlation_list, as ::lookup_abort() will also
          // construct such a list. Such nested usage is not allowed.
          LocalArenaPool<DataBufferSegment<14> > pool(nextTreeNodePtr.p->m_batchArena, m_dependency_map_pool);
          Local_correlation_list correlations(pool, nextTreeNodePtr.p->m_deferred.m_correlations);
          appended = correlations.append(&parentRow.m_src_correlation, 1);
        }
        if (unlikely(!appended))
        {
          jam();
          abort(signal, requestPtr, DbspjErr::OutOfQueryMemory);
          return;
        }

        // As there are pending deferred operations we are not complete
        requestPtr.p->m_completed_tree_nodes.clear(nextTreeNodePtr.p->m_node_no);
      } //TN_ENQUEUE_OP
    }

    for (nextExec.first(it); !it.isNull(); nextExec.next(it))
    {
      Ptr<TreeNode> nextTreeNodePtr;
      m_treenode_pool.getPtr(nextTreeNodePtr, * it.data);

      /**
       * Execution of 'next' TreeNode may have to be delayed. Will be resumed
       * later, either by lookup_resume() or resumeBufferedNode()
       */
      static const Uint32 delayExec = TreeNode::TN_ENQUEUE_OP
                                    | TreeNode::TN_EXEC_WAIT;

      if ((nextTreeNodePtr.p->m_resumeEvents & delayExec) == 0)
      {
        jam();

        /**
         * 'rowRef' is the ancestor row from the immediate ancestor in
         * the execution plan. In case this is different from the parent-treeNode
         * in the 'query', we have to find the 'real' parentRow from the
         * parent as defined in the 'query'
         */
        RowPtr parentRow(rowRef);
        if (nextTreeNodePtr.p->m_parentPtrI != treeNodePtr.i)
        {
          Ptr<TreeNode> parentPtr;
          const Uint32 parentRowId = (parentRow.m_src_correlation >> 16);
          m_treenode_pool.getPtr(parentPtr, nextTreeNodePtr.p->m_parentPtrI);
          getBufferedRow(parentPtr, parentRowId, &parentRow);
        }

        ndbassert(nextTreeNodePtr.p->m_info != NULL);
        ndbassert(nextTreeNodePtr.p->m_info->m_parent_row != NULL);

        (this->*(nextTreeNodePtr.p->m_info->m_parent_row))(signal,
            requestPtr, nextTreeNodePtr, parentRow);

        /* Recheck RS_ABORTING as 'next' operation might have aborted */
        if (unlikely(requestPtr.p->m_state & Request::RS_ABORTING))
        {
          jam();
          return;
        }
      }
    }
  }
}
4900
4901
4902 /**
4903 * MODULE LOOKUP
4904 */
/**
 * g_LookupOpInfo - dispatch table ('virtual methods') for TreeNodes
 * performing key lookups. Null entries are events the lookup
 * implementation does not handle.
 */
const Dbspj::OpInfo
Dbspj::g_LookupOpInfo =
{
  &Dbspj::lookup_build,
  0, // prepare
  &Dbspj::lookup_start,
  &Dbspj::lookup_countSignal,
  &Dbspj::lookup_execLQHKEYREF,
  &Dbspj::lookup_execLQHKEYCONF,
  0, // execSCAN_FRAGREF
  0, // execSCAN_FRAGCONF
  &Dbspj::lookup_parent_row,
  0, // Dbspj::lookup_parent_batch_complete,
  0, // Dbspj::lookup_parent_batch_repeat,
  0, // Dbspj::lookup_parent_batch_cleanup,
  0, // Dbspj::lookup_execSCAN_NEXTREQ
  0, // Dbspj::lookup_complete
  &Dbspj::lookup_abort,
  &Dbspj::lookup_execNODE_FAILREP,
  &Dbspj::lookup_cleanup,
  &Dbspj::lookup_checkNode,
  &Dbspj::lookup_dumpNode
};
4928
4929 Uint32
lookup_build(Build_context & ctx,Ptr<Request> requestPtr,const QueryNode * qn,const QueryNodeParameters * qp)4930 Dbspj::lookup_build(Build_context& ctx,
4931 Ptr<Request> requestPtr,
4932 const QueryNode* qn,
4933 const QueryNodeParameters* qp)
4934 {
4935 Uint32 err = 0;
4936 Ptr<TreeNode> treeNodePtr;
4937 const QN_LookupNode * node = (const QN_LookupNode*)qn;
4938 const QN_LookupParameters * param = (const QN_LookupParameters*)qp;
4939 do
4940 {
4941 jam();
4942 err = DbspjErr::InvalidTreeNodeSpecification;
4943 if (unlikely(node->len < QN_LookupNode::NodeSize))
4944 {
4945 jam();
4946 break;
4947 }
4948
4949 err = DbspjErr::InvalidTreeParametersSpecification;
4950 DEBUG("param len: " << param->len);
4951 if (unlikely(param->len < QN_LookupParameters::NodeSize))
4952 {
4953 jam();
4954 break;
4955 }
4956
4957 err = createNode(ctx, requestPtr, treeNodePtr);
4958 if (unlikely(err != 0))
4959 {
4960 jam();
4961 break;
4962 }
4963
4964 treeNodePtr.p->m_tableOrIndexId = node->tableId;
4965 treeNodePtr.p->m_primaryTableId = node->tableId;
4966 treeNodePtr.p->m_schemaVersion = node->tableVersion;
4967 treeNodePtr.p->m_info = &g_LookupOpInfo;
4968 Uint32 transId1 = requestPtr.p->m_transId[0];
4969 Uint32 transId2 = requestPtr.p->m_transId[1];
4970 Uint32 savePointId = ctx.m_savepointId;
4971
4972 Uint32 treeBits = node->requestInfo;
4973 Uint32 paramBits = param->requestInfo;
4974 //ndbout_c("Dbspj::lookup_build() treeBits=%.8x paramBits=%.8x",
4975 // treeBits, paramBits);
4976 LqhKeyReq* dst = (LqhKeyReq*)treeNodePtr.p->m_lookup_data.m_lqhKeyReq;
4977 {
4978 /**
4979 * static variables
4980 */
4981 dst->tcBlockref = reference();
4982 dst->clientConnectPtr = treeNodePtr.i;
4983
4984 /**
4985 * TODO reference()+treeNodePtr.i is passed twice
4986 * this can likely be optimized using the requestInfo-bits
4987 * UPDATE: This can be accomplished by *not* setApplicationAddressFlag
4988 * and patch LQH to then instead use tcBlockref/clientConnectPtr
4989 */
4990 dst->transId1 = transId1;
4991 dst->transId2 = transId2;
4992 dst->savePointId = savePointId;
4993 dst->scanInfo = 0;
4994 dst->attrLen = 0;
4995 /** Initialy set reply ref to client, do_send will set SPJ refs if non-LEAF */
4996 dst->variableData[0] = ctx.m_resultRef;
4997 dst->variableData[1] = param->resultData;
4998 Uint32 requestInfo = 0;
4999 LqhKeyReq::setOperation(requestInfo, ZREAD);
5000 LqhKeyReq::setApplicationAddressFlag(requestInfo, 1);
5001 LqhKeyReq::setDirtyFlag(requestInfo, 1);
5002 LqhKeyReq::setSimpleFlag(requestInfo, 1);
5003 LqhKeyReq::setNormalProtocolFlag(requestInfo, 0); // Assume T_LEAF
5004 LqhKeyReq::setCorrFactorFlag(requestInfo, 1);
5005 LqhKeyReq::setNoDiskFlag(requestInfo,
5006 (treeBits & DABits::NI_LINKED_DISK) == 0 &&
5007 (paramBits & DABits::PI_DISK_ATTR) == 0);
5008
5009 // FirstMatch in a lookup request can just be ignored
5010 //if (treeBits & DABits::NI_FIRST_MATCH)
5011 //{}
5012
5013 dst->requestInfo = requestInfo;
5014 }
5015
5016 if (treeBits & QN_LookupNode::L_UNIQUE_INDEX)
5017 {
5018 jam();
5019 treeNodePtr.p->m_bits |= TreeNode::T_UNIQUE_INDEX_LOOKUP;
5020 }
5021
5022 Uint32 tableId = node->tableId;
5023 Uint32 schemaVersion = node->tableVersion;
5024
5025 Uint32 tableSchemaVersion = tableId + ((schemaVersion << 16) & 0xFFFF0000);
5026 dst->tableSchemaVersion = tableSchemaVersion;
5027
5028 ctx.m_resultData = param->resultData;
5029 treeNodePtr.p->m_lookup_data.m_api_resultRef = ctx.m_resultRef;
5030 treeNodePtr.p->m_lookup_data.m_api_resultData = param->resultData;
5031 treeNodePtr.p->m_lookup_data.m_outstanding = 0;
5032
5033 /**
5034 * Parse stuff common lookup/scan-frag
5035 */
5036 struct DABuffer nodeDA, paramDA;
5037 nodeDA.ptr = node->optional;
5038 nodeDA.end = nodeDA.ptr + (node->len - QN_LookupNode::NodeSize);
5039 paramDA.ptr = param->optional;
5040 paramDA.end = paramDA.ptr + (param->len - QN_LookupParameters::NodeSize);
5041 err = parseDA(ctx, requestPtr, treeNodePtr,
5042 nodeDA, treeBits, paramDA, paramBits);
5043 if (unlikely(err != 0))
5044 {
5045 jam();
5046 break;
5047 }
5048
5049 if (treeNodePtr.p->m_bits & TreeNode::T_ATTR_INTERPRETED)
5050 {
5051 jam();
5052 LqhKeyReq::setInterpretedFlag(dst->requestInfo, 1);
5053 }
5054
5055 /**
5056 * Inherit batch size from parent
5057 */
5058 treeNodePtr.p->m_batch_size = 1;
5059 if (treeNodePtr.p->m_parentPtrI != RNIL)
5060 {
5061 jam();
5062 Ptr<TreeNode> parentPtr;
5063 m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
5064 treeNodePtr.p->m_batch_size = parentPtr.p->m_batch_size;
5065 }
5066
5067 if (ctx.m_start_signal)
5068 {
5069 jam();
5070 Signal * signal = ctx.m_start_signal;
5071 const LqhKeyReq* src = (const LqhKeyReq*)signal->getDataPtr();
5072 #ifdef NOT_YET
5073 Uint32 instanceNo =
5074 blockToInstance(signal->header.theReceiversBlockNumber);
5075 treeNodePtr.p->m_send.m_ref = numberToRef(DBLQH,
5076 instanceNo, getOwnNodeId());
5077 #else
5078 treeNodePtr.p->m_send.m_ref =
5079 numberToRef(DBLQH, getInstanceKey(src->tableSchemaVersion & 0xFFFF,
5080 src->fragmentData & 0xFFFF),
5081 getOwnNodeId());
5082 #endif
5083
5084 Uint32 hashValue = src->hashValue;
5085 Uint32 fragId = src->fragmentData;
5086 Uint32 attrLen = src->attrLen; // fragdist-key is in here
5087
5088 /**
5089 * assertions
5090 */
5091 #ifdef VM_TRACE
5092 Uint32 requestInfo = src->requestInfo;
5093 ndbassert(LqhKeyReq::getAttrLen(attrLen) == 0); // Only long
5094 ndbassert(LqhKeyReq::getScanTakeOverFlag(attrLen) == 0);// Not supported
5095 ndbassert(LqhKeyReq::getReorgFlag(attrLen) == ScanFragReq::REORG_ALL); // Not supported
5096 ndbassert(LqhKeyReq::getOperation(requestInfo) == ZREAD);
5097 ndbassert(LqhKeyReq::getKeyLen(requestInfo) == 0); // Only long
5098 ndbassert(LqhKeyReq::getMarkerFlag(requestInfo) == 0); // Only read
5099 ndbassert(LqhKeyReq::getAIInLqhKeyReq(requestInfo) == 0);
5100 ndbassert(LqhKeyReq::getSeqNoReplica(requestInfo) == 0);
5101 ndbassert(LqhKeyReq::getLastReplicaNo(requestInfo) == 0);
5102 ndbassert(LqhKeyReq::getApplicationAddressFlag(requestInfo) != 0);
5103 ndbassert(LqhKeyReq::getSameClientAndTcFlag(requestInfo) == 0);
5104 #endif
5105
5106 #ifdef TODO
5107 /**
5108 * Handle various lock-modes
5109 */
5110 static Uint8 getDirtyFlag(const UintR & requestInfo);
5111 static Uint8 getSimpleFlag(const UintR & requestInfo);
5112 #endif
5113
5114 #ifdef VM_TRACE
5115 Uint32 dst_requestInfo = dst->requestInfo;
5116 ndbassert(LqhKeyReq::getInterpretedFlag(requestInfo) ==
5117 LqhKeyReq::getInterpretedFlag(dst_requestInfo));
5118 ndbassert(LqhKeyReq::getNoDiskFlag(requestInfo) ==
5119 LqhKeyReq::getNoDiskFlag(dst_requestInfo));
5120 #endif
5121
5122 dst->hashValue = hashValue;
5123 dst->fragmentData = fragId;
5124 dst->attrLen = attrLen; // fragdist is in here
5125
5126 treeNodePtr.p->m_bits |= TreeNode::T_ONE_SHOT;
5127 }
5128 return 0;
5129 } while (0);
5130
5131 return err;
5132 }
5133
void
Dbspj::lookup_start(Signal* signal,
                    Ptr<Request> requestPtr,
                    Ptr<TreeNode> treeNodePtr)
{
  // Starting a lookup TreeNode is simply sending its (prepared) LQHKEYREQ.
  lookup_send(signal, requestPtr, treeNodePtr);
}
5141
/**
 * Send the LQHKEYREQ prepared for this lookup TreeNode to the LQH
 * instance selected in m_send.m_ref. Registers the number of reply
 * signals (CONF/REF and/or TRANSID_AI) to expect before the TreeNode
 * is considered complete. On any failure the entire request is aborted.
 */
void
Dbspj::lookup_send(Signal* signal,
                   Ptr<Request> requestPtr,
                   Ptr<TreeNode> treeNodePtr)
{
  jam();
  if (!ERROR_INSERTED(17521)) // Avoid emulated rnd errors
  {
    // ::checkTableError() should be handled before we reach this far
    ndbassert(checkTableError(treeNodePtr) == 0);
  }

  /**
   * Count number of expected reply signals:
   * CONF or REF reply:
   *  - Expected by every non-leaf TreeNodes
   *  - For a scan request even leaf TreeNodes get a CONF/REF reply.
   *
   * TRANSID_AI reply:
   *  - Expected for all TreeNodes having T_EXPECT_TRANSID_AI
   */
  Uint32 cnt = 0;

  if (requestPtr.p->isScan() || !treeNodePtr.p->isLeaf())  //CONF/REF
    cnt++;

  if (treeNodePtr.p->m_bits & TreeNode::T_EXPECT_TRANSID_AI)  //TRANSID_AI
    cnt++;

  LqhKeyReq* req = reinterpret_cast<LqhKeyReq*>(signal->getDataPtrSend());

  // Copy the pre-built LQHKEYREQ template, then fill in per-row fields.
  memcpy(req, treeNodePtr.p->m_lookup_data.m_lqhKeyReq,
         sizeof(treeNodePtr.p->m_lookup_data.m_lqhKeyReq));
  req->variableData[2] = treeNodePtr.p->m_send.m_correlation;
  req->variableData[3] = requestPtr.p->m_rootResultData;

  if (!treeNodePtr.p->isLeaf() || requestPtr.p->isScan())
  {
    // Non-LEAF want reply to SPJ instead of ApiClient.
    LqhKeyReq::setNormalProtocolFlag(req->requestInfo, 1);
    req->variableData[0] = reference();
    req->variableData[1] = treeNodePtr.i;
  }
  else
  {
    jam();
    /**
     * Fake that TC sent this request,
     * so that it can route a maybe TCKEYREF
     */
    req->tcBlockref = requestPtr.p->m_senderRef;
  }

  SectionHandle handle(this);

  Uint32 ref = treeNodePtr.p->m_send.m_ref;
  Uint32 keyInfoPtrI = treeNodePtr.p->m_send.m_keyInfoPtrI;
  Uint32 attrInfoPtrI = treeNodePtr.p->m_send.m_attrInfoPtrI;

  Uint32 err = 0;

  do
  {
    if (treeNodePtr.p->m_bits & TreeNode::T_ONE_SHOT)
    {
      jam();
      /**
       * Pass sections to send
       * (ownership of the sections is handed over to the signal;
       *  clear our references so they are not released twice)
       */
      treeNodePtr.p->m_send.m_attrInfoPtrI = RNIL;
      treeNodePtr.p->m_send.m_keyInfoPtrI = RNIL;
    }
    else
    {
      // Sections may be resent for later parent rows: send duplicates,
      // or transfer ownership when the section is per-row constructed.
      if ((treeNodePtr.p->m_bits & TreeNode::T_KEYINFO_CONSTRUCTED) == 0)
      {
        jam();
        Uint32 tmp = RNIL;
        if (!dupSection(tmp, keyInfoPtrI))
        {
          jam();
          ndbassert(tmp == RNIL);  // Guard for memleak
          err = DbspjErr::OutOfSectionMemory;
          break;
        }

        keyInfoPtrI = tmp;
      }
      else
      {
        jam();
        // Per-row constructed keyInfo: signal takes ownership.
        treeNodePtr.p->m_send.m_keyInfoPtrI = RNIL;
      }

      if ((treeNodePtr.p->m_bits & TreeNode::T_ATTRINFO_CONSTRUCTED) == 0)
      {
        jam();
        Uint32 tmp = RNIL;

        /**
         * Test execution terminated due to 'OutOfSectionMemory' which
         * may happen for different treeNodes in the request:
         * - 17070: Fail on any lookup_send()
         * - 17071: Fail on lookup_send() if 'isLeaf'
         * - 17072: Fail on lookup_send() if treeNode not root
         */
        if (ERROR_INSERTED(17070) ||
            (ERROR_INSERTED(17071) && treeNodePtr.p->isLeaf()) ||
            (ERROR_INSERTED(17072) && treeNodePtr.p->m_parentPtrI != RNIL))
        {
          jam();
          CLEAR_ERROR_INSERT_VALUE;
          ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
                   __LINE__, __FILE__);
          releaseSection(keyInfoPtrI);
          err = DbspjErr::OutOfSectionMemory;
          break;
        }

        if (!dupSection(tmp, attrInfoPtrI))
        {
          jam();
          ndbassert(tmp == RNIL);  // Guard for memleak
          releaseSection(keyInfoPtrI);  // keyInfo dup'ed above must be freed
          err = DbspjErr::OutOfSectionMemory;
          break;
        }

        attrInfoPtrI = tmp;
      }
      else
      {
        jam();
        // Per-row constructed attrInfo: signal takes ownership.
        treeNodePtr.p->m_send.m_attrInfoPtrI = RNIL;
      }
    }

    getSection(handle.m_ptr[0], keyInfoPtrI);
    getSection(handle.m_ptr[1], attrInfoPtrI);
    handle.m_cnt = 2;

    /**
     * Inject error to test LQHKEYREF handling:
     * Tampering with tableSchemaVersion such that LQH will
     * return LQHKEYREF('1227: Invalid schema version')
     * May happen for different treeNodes in the request:
     * - 17030: Fail on any lookup_send()
     * - 17031: Fail on lookup_send() if 'isLeaf'
     * - 17032: Fail on lookup_send() if treeNode not root
     */
    if (ERROR_INSERTED(17030) ||
        (ERROR_INSERTED(17031) && treeNodePtr.p->isLeaf()) ||
        (ERROR_INSERTED(17032) && treeNodePtr.p->m_parentPtrI != RNIL))
    {
      jam();
      CLEAR_ERROR_INSERT_VALUE;
      req->tableSchemaVersion += (1 << 16);  // Provoke 'Invalid schema version'
    }

#if defined DEBUG_LQHKEYREQ
    ndbout_c("LQHKEYREQ to %x", ref);
    printLQHKEYREQ(stdout, signal->getDataPtrSend(),
                   NDB_ARRAY_SIZE(treeNodePtr.p->m_lookup_data.m_lqhKeyReq),
                   DBLQH);
    printf("KEYINFO: ");
    print(handle.m_ptr[0], stdout);
    printf("ATTRINFO: ");
    print(handle.m_ptr[1], stdout);
#endif

    Uint32 Tnode = refToNode(ref);
    if (Tnode == getOwnNodeId())
    {
      c_Counters.incr_counter(CI_LOCAL_READS_SENT, 1);
    }
    else
    {
      ndbrequire(!ERROR_INSERTED(17014));

      c_Counters.incr_counter(CI_REMOTE_READS_SENT, 1);
    }

    /**
     * Test correct abort handling if datanode not (yet)
     * connected to requesting API node.
     */
    if (ERROR_INSERTED(17530) &&
        !getNodeInfo(getResultRef(requestPtr)).m_connected)
    {
      jam();
      releaseSections(handle);
      err = DbspjErr::OutOfSectionMemory;  //Fake an error likely seen here
      break;
    }

    /**
     * Test execution terminated due to 'NodeFailure' which
     * may happen for different treeNodes in the request:
     * - 17020: Fail on any lookup_send()
     * - 17021: Fail on lookup_send() if 'isLeaf'
     * - 17022: Fail on lookup_send() if treeNode not root
     */
    if (ERROR_INSERTED(17020) ||
        (ERROR_INSERTED(17021) && treeNodePtr.p->isLeaf()) ||
        (ERROR_INSERTED(17022) && treeNodePtr.p->m_parentPtrI != RNIL))
    {
      jam();
      CLEAR_ERROR_INSERT_VALUE;
      releaseSections(handle);
      err = DbspjErr::NodeFailure;
      break;
    }

    if (unlikely(!c_alive_nodes.get(Tnode)))
    {
      jam();
      releaseSections(handle);
      err = DbspjErr::NodeFailure;
      break;
    }
    else if (cnt > 0)
    {
      // Register signal 'cnt' required before completion
      jam();
      ndbassert(Tnode < NDB_ARRAY_SIZE(requestPtr.p->m_lookup_node_data));
      requestPtr.p->m_completed_tree_nodes.clear(treeNodePtr.p->m_node_no);
      requestPtr.p->m_outstanding += cnt;
      requestPtr.p->m_lookup_node_data[Tnode] += cnt;
      // number wrapped
      ndbrequire(requestPtr.p->m_lookup_node_data[Tnode] != 0);
    }

    sendSignal(ref, GSN_LQHKEYREQ, signal,
               NDB_ARRAY_SIZE(treeNodePtr.p->m_lookup_data.m_lqhKeyReq),
               JBB, &handle);

    treeNodePtr.p->m_lookup_data.m_outstanding += cnt;
    if (requestPtr.p->isLookup() && treeNodePtr.p->isLeaf())
    {
      jam();
      /**
       * Send TCKEYCONF with DirtyReadBit + Tnode,
       * so that API can discover if Tnode died while waiting for result
       */
      lookup_sendLeafCONF(signal, requestPtr, treeNodePtr, Tnode);
    }
    return;
  }
  while (0);

  // Any error reaching this point is 'hard': terminate the whole request.
  ndbrequire(err);
  jam();
  abort(signal, requestPtr, err);
}  //Dbspj::lookup_send
5396
/**
 * Account for 'cnt' reply signals received (or known never to arrive)
 * from the node which sent 'signal'. Decrements the per-node, per-request
 * and per-TreeNode outstanding counters, and marks the TreeNode complete
 * when nothing more is outstanding and no deferred operations remain.
 * Note: cnt==0 is allowed, used to (re-)evaluate completion status only.
 */
void
Dbspj::lookup_countSignal(const Signal* signal,
                          Ptr<Request> requestPtr,
                          Ptr<TreeNode> treeNodePtr,
                          Uint32 cnt)
{
  jam();
  const Uint32 Tnode = refToNode(signal->getSendersBlockRef());

  ndbassert(requestPtr.p->m_lookup_node_data[Tnode] >= cnt);
  requestPtr.p->m_lookup_node_data[Tnode] -= cnt;

  ndbassert(requestPtr.p->m_outstanding >= cnt);
  requestPtr.p->m_outstanding -= cnt;

  ndbassert(treeNodePtr.p->m_lookup_data.m_outstanding >= cnt);
  treeNodePtr.p->m_lookup_data.m_outstanding -= cnt;

  if (treeNodePtr.p->m_lookup_data.m_outstanding == 0 &&
      treeNodePtr.p->m_deferred.isEmpty())
  {
    jam();
    // We have received all rows for this treeNode in this batch.
    requestPtr.p->m_completed_tree_nodes.set(treeNodePtr.p->m_node_no);
  }
}
5423
/**
 * Handle LQHKEYREF received for a lookup TreeNode.
 * 'Soft errors' (626 row-not-found, 899 interpreter-exit-nok) terminate
 * only this branch of the query tree; any other error code aborts the
 * entire request. Also resumes a possibly waiting sibling TreeNode, and
 * reports TreeNode completion if this was the last outstanding signal.
 */
void
Dbspj::lookup_execLQHKEYREF(Signal* signal,
                            Ptr<Request> requestPtr,
                            Ptr<TreeNode> treeNodePtr)
{
  jam();
  const LqhKeyRef * rep = (LqhKeyRef*)signal->getDataPtr();
  const Uint32 errCode = rep->errorCode;

  c_Counters.incr_counter(CI_READS_NOT_FOUND, 1);

  DEBUG("lookup_execLQHKEYREF, errorCode:" << errCode);

  if (treeNodePtr.p->m_bits & TreeNode::T_EXPECT_TRANSID_AI)
  {
    // Count(==2) the REF and the non-arriving TRANSID_AI
    lookup_countSignal(signal, requestPtr, treeNodePtr, 2);
  }
  else
  {
    // Count(==1) only awaiting CONF/REF
    lookup_countSignal(signal, requestPtr, treeNodePtr, 1);
  }

  /**
   * If Request is still actively running: API need to
   * be informed about error.
   * Error code may either indicate a 'hard error' which should
   * terminate the query execution, or a 'soft error' which
   * should be signaled NDBAPI, and execution continued.
   */
  if (likely((requestPtr.p->m_state & Request::RS_ABORTING) == 0))
  {
    switch(errCode){
    case 626: // 'Soft error' : Row not found
    case 899: // 'Soft error' : Interpreter_exit_nok

      jam();
      /**
       * Only Lookup-request need to send TCKEYREF...
       */
      if (requestPtr.p->isLookup())
      {
        jam();
        lookup_stop_branch(signal, requestPtr, treeNodePtr, errCode);
      }
      break;

    default: // 'Hard error' : abort query
      jam();
      abort(signal, requestPtr, errCode);
      return;
    }
  }

  /**
   * Another TreeNode awaited for completion of this request
   * before it could resume its operation.
   */
  if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_REF)
  {
    jam();
    ndbassert(treeNodePtr.p->m_resumePtrI != RNIL);
    Ptr<TreeNode> resumeTreeNodePtr;
    m_treenode_pool.getPtr(resumeTreeNodePtr, treeNodePtr.p->m_resumePtrI);
    lookup_resume(signal, requestPtr, resumeTreeNodePtr);
  }

  if (requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no))
  {
    jam();
    // We have received all rows for this treeNode in this batch.
    handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
  }
}
5499
/**
 * lookup_stop_branch() will send required signals to the API
 * to inform that the query branch starting with 'treeNodePtr'
 * will not be executed due to 'errCode'.
 *
 * NOTE: 'errCode' is expected to be a 'soft error', like
 *       'row not found', and is *not* intended to abort
 *       entire query.
 */
void
Dbspj::lookup_stop_branch(Signal* signal,
                          Ptr<Request> requestPtr,
                          Ptr<TreeNode> treeNodePtr,
                          Uint32 errCode)
{
  ndbassert(requestPtr.p->isLookup());
  DEBUG("::lookup_stop_branch"
     << ", node: " << treeNodePtr.p->m_node_no
  );

  /**
   * If this is a "leaf" node, either on its own, or
   * indirectly through an unique index lookup:
   * Ordinary operation would have emitted extra TCKEYCONF
   * required for nodefail handling.
   * (In case of nodefails during final leaf REQs).
   * As API can't, or at least does not try to, tell whether
   * leaf operation is REFed by SPJ or LQH, we still have to
   * send this extra CONF as required by protocol.
   */
  if (treeNodePtr.p->isLeaf())
  {
    jam();
    DEBUG(" Leaf-lookup: sending extra 'CONF' for nodefail handling");
    lookup_sendLeafCONF(signal, requestPtr, treeNodePtr, getOwnNodeId());
  }

  else if (treeNodePtr.p->m_bits & TreeNode::T_UNIQUE_INDEX_LOOKUP)
  {
    /**
     * UNIQUE_INDEX lookups are represented with an additional
     * child which does the lookup from UQ-index into the table
     * itself. Has to check this child for being 'leaf'.
     */
    LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
    Local_dependency_map list(pool, treeNodePtr.p->m_child_nodes);
    Dependency_map::ConstDataBufferIterator it;
    ndbrequire(list.first(it));
    ndbrequire(list.getSize() == 1); // should only be 1 child
    Ptr<TreeNode> childPtr;
    m_treenode_pool.getPtr(childPtr, * it.data);
    if (childPtr.p->isLeaf())
    {
      jam();
      DEBUG(" UNUQUE_INDEX-Leaf-lookup: sending extra 'CONF' "
            "for nodefail handling");
      lookup_sendLeafCONF(signal, requestPtr, childPtr, getOwnNodeId());
    }
  }

  /**
   * Then produce the REF(errCode) which terminates this
   * tree branch.
   */
  const Uint32 resultRef = treeNodePtr.p->m_lookup_data.m_api_resultRef;
  const Uint32 resultData = treeNodePtr.p->m_lookup_data.m_api_resultData;
  TcKeyRef* ref = (TcKeyRef*)signal->getDataPtr();
  ref->connectPtr = resultData;
  ref->transId[0] = requestPtr.p->m_transId[0];
  ref->transId[1] = requestPtr.p->m_transId[1];
  ref->errorCode = errCode;
  ref->errorData = 0;

  DEBUG(" send TCKEYREF");
  sendTCKEYREF(signal, resultRef, requestPtr.p->m_senderRef);
}
5576
5577 /**
5578 * Lookup leafs in lookup requests will not receive CONF/REF
5579 * back to SPJ when LQH request has completed. Instead we
5580 * will cleanup() the request when the last leafnode KEYREQ
5581 * has been sent. If any of the REQuested datanodes fails
5582 * after this, SPJ will not detect this and be able to
5583 * send appropriate signals to the API to awake it from the
5584 * 'wait' state.
5585 * To get around this, we instead send an extra CONF
5586 * to the API which inform it about which 'node' it should
5587 * expect a result from. API can then discover if this
5588 * 'node' died while waiting for results.
5589 */
5590 void
lookup_sendLeafCONF(Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr,Uint32 node)5591 Dbspj::lookup_sendLeafCONF(Signal* signal,
5592 Ptr<Request> requestPtr,
5593 Ptr<TreeNode> treeNodePtr,
5594 Uint32 node)
5595 {
5596 ndbassert(treeNodePtr.p->isLeaf());
5597
5598 const Uint32 resultRef = treeNodePtr.p->m_lookup_data.m_api_resultRef;
5599 const Uint32 resultData = treeNodePtr.p->m_lookup_data.m_api_resultData;
5600 TcKeyConf* const conf = (TcKeyConf*)signal->getDataPtr();
5601 conf->apiConnectPtr = RNIL;
5602 conf->confInfo = 0;
5603 conf->gci_hi = 0;
5604 TcKeyConf::setNoOfOperations(conf->confInfo, 1);
5605 conf->transId1 = requestPtr.p->m_transId[0];
5606 conf->transId2 = requestPtr.p->m_transId[1];
5607 conf->operations[0].apiOperationPtr = resultData;
5608 conf->operations[0].attrInfoLen =
5609 TcKeyConf::DirtyReadBit | node;
5610 const Uint32 sigLen = TcKeyConf::StaticLength + TcKeyConf::OperationLength;
5611 sendTCKEYCONF(signal, sigLen, resultRef, requestPtr.p->m_senderRef);
5612 }
5613
5614
/**
 * Handle LQHKEYCONF received for a lookup TreeNode.
 * Leaf nodes of a lookup request never get a CONF back to SPJ
 * (guarded by the ndbrequire below). Counts the received signal,
 * resumes a possibly waiting sibling TreeNode, and reports TreeNode
 * completion if this was the last outstanding signal.
 */
void
Dbspj::lookup_execLQHKEYCONF(Signal* signal,
                             Ptr<Request> requestPtr,
                             Ptr<TreeNode> treeNodePtr)
{
  ndbrequire(!(requestPtr.p->isLookup() && treeNodePtr.p->isLeaf()));

  if (treeNodePtr.p->m_bits & TreeNode::T_USER_PROJECTION)
  {
    jam();
    requestPtr.p->m_rows++;
  }

  // Count awaiting CONF. If non-leaf, there will also be a TRANSID_AI
  lookup_countSignal(signal, requestPtr, treeNodePtr, 1);

  /**
   * Another TreeNode awaited for completion of this request
   * before it could resume its operation.
   */
  if (treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_CONF)
  {
    jam();
    ndbassert(treeNodePtr.p->m_resumePtrI != RNIL);
    Ptr<TreeNode> resumeTreeNodePtr;
    m_treenode_pool.getPtr(resumeTreeNodePtr, treeNodePtr.p->m_resumePtrI);
    lookup_resume(signal, requestPtr, resumeTreeNodePtr);
  }

  if (requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no))
  {
    jam();
    // We have received all rows for this treeNode in this batch.
    handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
  }
}
5651
/**
 * A row was produced by this TreeNode's parent: trigger the
 * child lookup for that row. Thin wrapper around lookup_row().
 */
void
Dbspj::lookup_parent_row(Signal* signal,
                         Ptr<Request> requestPtr,
                         Ptr<TreeNode> treeNodePtr,
                         const RowPtr & rowRef)
{
  jam();

  DEBUG("::lookup_parent_row"
     << ", node: " << treeNodePtr.p->m_node_no);
  lookup_row(signal, requestPtr, treeNodePtr, rowRef);
} // Dbspj::lookup_parent_row()
5664
/**
 * lookup_resume() is a delayed lookup_parent_row.
 * It will locate the next parent row now allowed to execute,
 * and create a child lookup request for that row.
 */
void
Dbspj::lookup_resume(Signal* signal,
                     Ptr<Request> requestPtr,
                     Ptr<TreeNode> treeNodePtr)
{
  jam();
  DEBUG("::lookup_resume"
     << ", node: " << treeNodePtr.p->m_node_no
  );

  ndbassert(treeNodePtr.p->m_parentPtrI != RNIL);
  Ptr<TreeNode> parentPtr;
  m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);

  if (unlikely(requestPtr.p->m_state & Request::RS_ABORTING))
  {
    jam();
    return;  // Request is being torn down; nothing to resume.
  }
  ndbassert(!treeNodePtr.p->m_deferred.isEmpty());
  ndbassert(!requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no));

  // Pick the next deferred correlation value, advancing m_pos.
  Uint32 corrVal;
  {
    LocalArenaPool<DataBufferSegment<14> > pool(treeNodePtr.p->m_batchArena, m_dependency_map_pool);
    Local_correlation_list correlations(pool, treeNodePtr.p->m_deferred.m_correlations);

    Local_correlation_list::DataBufferIterator it;
    const bool valid = correlations.position(it, (Uint32)(treeNodePtr.p->m_deferred.m_pos++));
    (void)valid; ndbassert(valid);
    corrVal = *it.data;
  }

  // Set up RowPtr & RowRef for this parent row
  RowPtr row;
  row.m_src_node_ptrI = parentPtr.i;
  row.m_src_correlation = corrVal;

  // Parent rows must be kept in a map so they can be relocated by rowId.
  ndbassert(parentPtr.p->m_rows.m_type == RowCollection::COLLECTION_MAP);
  RowRef ref;
  parentPtr.p->m_rows.m_map.copyto(ref);
  const Uint32* const mapptr = get_row_ptr(ref);

  // Relocate parent row from correlation value.
  const Uint32 rowId = (corrVal & 0xFFFF);
  parentPtr.p->m_rows.m_map.load(mapptr, rowId, ref);

  const Uint32* const rowptr = get_row_ptr(ref);
  setupRowPtr(parentPtr, row, ref, rowptr);

  lookup_row(signal, requestPtr, treeNodePtr, row);
} // Dbspj::lookup_resume()
5722
/**
 * Produce and send the child LQHKEYREQ resulting from one parent row:
 * builds the key from the key-pattern (if per-row constructed), computes
 * hash and target node (work normally done by TC), expands any per-row
 * attrInfo parameters, then delegates the send to lookup_send().
 * A NULL-valued key short-circuits the request as REF(626) would be
 * the guaranteed outcome. Hard errors abort the entire request.
 */
void
Dbspj::lookup_row(Signal* signal,
                  Ptr<Request> requestPtr,
                  Ptr<TreeNode> treeNodePtr,
                  const RowPtr & rowRef)
{
  jam();

  /**
   * Here we need to...
   *   1) construct a key
   *   2) compute hash     (normally TC)
   *   3) get node for row (normally TC)
   */
  Uint32 err = 0;
  const Uint32 tableId = treeNodePtr.p->m_tableOrIndexId;
  const Uint32 corrVal = rowRef.m_src_correlation;

  DEBUG("::lookup_row"
     << ", node: " << treeNodePtr.p->m_node_no);

  do
  {
    err = checkTableError(treeNodePtr);
    if (unlikely(err != 0))
    {
      jam();
      break;
    }

    /**
     * Test execution terminated due to 'OutOfQueryMemory' which
     * may happen multiple places below:
     * - 17040: Fail on any lookup_parent_row()
     * - 17041: Fail on lookup_parent_row() if 'isLeaf'
     * - 17042: Fail on lookup_parent_row() if treeNode not root
     * - 17043: Fail after last outstanding signal received.
     */
    if (ERROR_INSERTED(17040) ||
        (ERROR_INSERTED(17041) && treeNodePtr.p->isLeaf()) ||
        (ERROR_INSERTED(17042) && treeNodePtr.p->m_parentPtrI != RNIL) ||
        (ERROR_INSERTED(17043) && requestPtr.p->m_outstanding == 0))
    {
      jam();
      CLEAR_ERROR_INSERT_VALUE;
      err = DbspjErr::OutOfQueryMemory;
      break;
    }

    Uint32 ptrI = RNIL;
    if (treeNodePtr.p->m_bits & TreeNode::T_KEYINFO_CONSTRUCTED)
    {
      jam();
      DEBUG("parent_row w/ T_KEYINFO_CONSTRUCTED");
      /**
       * Get key-pattern
       */
      LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
      Local_pattern_store pattern(pool, treeNodePtr.p->m_keyPattern);

      // Expand the key pattern with values from the parent row.
      bool keyIsNull;
      err = expand(ptrI, pattern, rowRef, keyIsNull);
      if (unlikely(err != 0))
      {
        jam();
        releaseSection(ptrI);
        break;
      }

      if (keyIsNull)
      {
        /**
         * When the key contains NULL values, an EQ-match is impossible!
         * Entire lookup request can therefore be eliminate as it is known
         * to be REFused with errorCode = 626 (Row not found).
         *
         * Scan requests can simply ignore these child LQHKEYREQs
         * as REFs are not needed, either by the API protocol,
         * or in order to handle TN_RESUME_REF.
         *
         * Lookup requests has to send the same KEYREFs as would have
         * been produced by LQH.
         */
        jam();
        DEBUG("Key contain NULL values: Ignore impossible KEYREQ");
        releaseSection(ptrI);
        ptrI = RNIL;

        /* count(==0) the not sent signal to update completion status */
        lookup_countSignal(signal, requestPtr, treeNodePtr, 0);

        /* Send KEYREF(errCode=626) as required by lookup request protocol */
        if (requestPtr.p->isLookup())
        {
          jam();
          lookup_stop_branch(signal, requestPtr, treeNodePtr, 626);
        }

        /**
         * Another TreeNode awaited completion of this treeNode
         * or sub-branch before it could resume its operation.
         */
        if ((treeNodePtr.p->m_resumeEvents & TreeNode::TN_RESUME_REF))
        {
          jam();
          DEBUG("handling TN_RESUME_REF");
          ndbassert(treeNodePtr.p->m_resumePtrI != RNIL);
          Ptr<TreeNode> resumeTreeNodePtr;
          m_treenode_pool.getPtr(resumeTreeNodePtr, treeNodePtr.p->m_resumePtrI);
          lookup_resume(signal, requestPtr, resumeTreeNodePtr);
        }

        /**
         * This possibly completed this treeNode, handle it.
         */
        if (requestPtr.p->m_completed_tree_nodes.get(treeNodePtr.p->m_node_no))
        {
          jam();
          handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
        }

        return;  // Bailout, KEYREQ would have returned KEYREF(626) anyway
      } // keyIsNull

      ndbassert(ptrI != RNIL);
      treeNodePtr.p->m_send.m_keyInfoPtrI = ptrI;
    } //T_KEYINFO_CONSTRUCTED

    // 2) + 3): compute hash and resolve the fragment/node for the key.
    BuildKeyReq tmp;
    err = computeHash(signal, tmp, tableId, treeNodePtr.p->m_send.m_keyInfoPtrI);
    if (unlikely(err != 0))
      break;

    err = getNodes(signal, tmp, tableId);
    if (unlikely(err != 0))
      break;

    Uint32 attrInfoPtrI = treeNodePtr.p->m_send.m_attrInfoPtrI;
    if (treeNodePtr.p->m_bits & TreeNode::T_ATTRINFO_CONSTRUCTED)
    {
      jam();
      Uint32 tmp = RNIL;

      /**
       * Test execution terminated due to 'OutOfSectionMemory' which
       * may happen for different treeNodes in the request:
       * - 17080: Fail on lookup_parent_row
       * - 17081: Fail on lookup_parent_row: if 'isLeaf'
       * - 17082: Fail on lookup_parent_row: if treeNode not root
       */
      if (ERROR_INSERTED(17080) ||
          (ERROR_INSERTED(17081) && treeNodePtr.p->isLeaf()) ||
          (ERROR_INSERTED(17082) && treeNodePtr.p->m_parentPtrI != RNIL))
      {
        jam();
        CLEAR_ERROR_INSERT_VALUE;
        ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
                 __LINE__, __FILE__);
        err = DbspjErr::OutOfSectionMemory;
        break;
      }

      // Work on a duplicate so the template attrInfo remains reusable.
      if (!dupSection(tmp, attrInfoPtrI))
      {
        jam();
        ndbassert(tmp == RNIL);  // Guard for memleak
        err = DbspjErr::OutOfSectionMemory;
        break;
      }

      Uint32 org_size;
      {
        SegmentedSectionPtr ptr;
        getSection(ptr, tmp);
        org_size = ptr.sz;
      }

      // Append per-row parameter values expanded from the parent row.
      bool hasNull;
      LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
      Local_pattern_store pattern(pool, treeNodePtr.p->m_attrParamPattern);
      err = expand(tmp, pattern, rowRef, hasNull);
      if (unlikely(err != 0))
      {
        jam();
        releaseSection(tmp);
        break;
      }
//    ndbrequire(!hasNull);

      /**
       * Update size of subsrouting section, which contains arguments
       */
      SegmentedSectionPtr ptr;
      getSection(ptr, tmp);
      Uint32 new_size = ptr.sz;
      Uint32 * sectionptrs = ptr.p->theData;
      sectionptrs[4] = new_size - org_size;

      treeNodePtr.p->m_send.m_attrInfoPtrI = tmp;
    }

    /**
     * Now send...
     */

    /**
     * TODO merge better with lookup_start (refactor)
     */
    {
      /* We set the upper half word of m_correlation to the tuple ID
       * of the parent, such that the API can match this tuple with its
       * parent.
       * Then we re-use the tuple ID of the parent as the
       * tuple ID for this tuple also. Since the tuple ID
       * is unique within this batch and SPJ block for the parent operation,
       * it must also be unique for this operation.
       * This ensures that lookup operations with no user projection will
       * work, since such operations will have the same tuple ID as their
       * parents. The API will then be able to match a tuple with its
       * grandparent, even if it gets no tuple for the parent operation.*/
      treeNodePtr.p->m_send.m_correlation =
        (corrVal << 16) + (corrVal & 0xffff);

      treeNodePtr.p->m_send.m_ref = tmp.receiverRef;
      LqhKeyReq * dst = (LqhKeyReq*)treeNodePtr.p->m_lookup_data.m_lqhKeyReq;
      dst->hashValue = tmp.hashInfo[0];
      dst->fragmentData = tmp.fragId;
      Uint32 attrLen = 0;
      LqhKeyReq::setDistributionKey(attrLen, tmp.fragDistKey);
      dst->attrLen = attrLen;
      lookup_send(signal, requestPtr, treeNodePtr);

      if (treeNodePtr.p->m_bits & TreeNode::T_ATTRINFO_CONSTRUCTED)
      {
        jam();
        // restore (the duplicate was handed over to lookup_send)
        treeNodePtr.p->m_send.m_attrInfoPtrI = attrInfoPtrI;
      }
    }
    return;
  } while (0);

  // If we fail it will always be a 'hard error' -> abort
  ndbrequire(err);
  jam();
  abort(signal, requestPtr, err);
}
5970
/**
 * Abort-handling for a lookup TreeNode: discard any deferred
 * (not yet resumed) operations belonging to the current batch.
 */
void
Dbspj::lookup_abort(Signal* signal,
                    Ptr<Request> requestPtr,
                    Ptr<TreeNode> treeNodePtr)
{
  jam();
  // Correlation ids for deferred operations are allocated in the batch specific
  // arena. It is sufficient to release entire memory arena.
  m_arenaAllocator.release(treeNodePtr.p->m_batchArena);
  treeNodePtr.p->m_deferred.init();
}
5982
5983 Uint32
lookup_execNODE_FAILREP(Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr,const NdbNodeBitmask mask)5984 Dbspj::lookup_execNODE_FAILREP(Signal* signal,
5985 Ptr<Request> requestPtr,
5986 Ptr<TreeNode> treeNodePtr,
5987 const NdbNodeBitmask mask)
5988 {
5989 jam();
5990 Uint32 node = 0;
5991 Uint32 sum = 0;
5992 while (requestPtr.p->m_outstanding &&
5993 ((node = mask.find(node + 1)) != NdbNodeBitmask::NotFound))
5994 {
5995 Uint32 cnt = requestPtr.p->m_lookup_node_data[node];
5996 sum += cnt;
5997 requestPtr.p->m_lookup_node_data[node] = 0;
5998 }
5999
6000 if (sum)
6001 {
6002 jam();
6003 ndbrequire(requestPtr.p->m_outstanding >= sum);
6004 requestPtr.p->m_outstanding -= sum;
6005 }
6006
6007 return sum;
6008 }
6009
void
Dbspj::lookup_cleanup(Ptr<Request> requestPtr,
                      Ptr<TreeNode> treeNodePtr)
{
  // Lookup TreeNodes have no type-specific resources beyond the common ones.
  cleanup_common(requestPtr, treeNodePtr);
}
6016
6017
/**
 * Compute the key hash for tables where the plain md5 over the raw key
 * is insufficient: keys with character attributes need normalisation
 * (xfrm) first, and tables with a distribution key different from the
 * primary key need a second, separate distribution hash.
 *
 * @param tableId  Table whose key description drives xfrm/distr-key logic
 * @param dstHash  Out: 4-word hash; word [1] receives the distribution hash
 * @param src      64-bit aligned key data
 * @param srcLen   Key length in 32-bit words
 * @param desc     Key descriptor for 'tableId'
 * @return 0 on success, 290 if the key could not be xfrm'ed
 */
Uint32
Dbspj::handle_special_hash(Uint32 tableId, Uint32 dstHash[4],
                           const Uint64* src,
                           Uint32 srcLen,       // Len in #32bit words
                           const KeyDescriptor* desc)
{
  const Uint32 MAX_KEY_SIZE_IN_LONG_WORDS=
    (MAX_KEY_SIZE_IN_WORDS + 1) / 2;
  // 64-bit aligned workspace, sized for the worst-case xfrm expansion.
  Uint64 alignedWorkspace[MAX_KEY_SIZE_IN_LONG_WORDS * MAX_XFRM_MULTIPLY];
  const bool hasVarKeys = desc->noOfVarKeys > 0;
  const bool hasCharAttr = desc->hasCharAttr;
  const bool compute_distkey = desc->noOfDistrKeys > 0;

  const Uint64 *hashInput = 0;
  Uint32 inputLen = 0;
  Uint32 keyPartLen[MAX_ATTRIBUTES_IN_INDEX];
  Uint32 * keyPartLenPtr;

  /* Normalise KeyInfo into workspace if necessary */
  if (hasCharAttr || (compute_distkey && hasVarKeys))
  {
    hashInput = alignedWorkspace;
    keyPartLenPtr = keyPartLen;
    inputLen = xfrm_key_hash(tableId,
                             (Uint32*)src,
                             (Uint32*)alignedWorkspace,
                             sizeof(alignedWorkspace) >> 2,
                             keyPartLenPtr);
    if (unlikely(inputLen == 0))
    {
      return 290;  // 'Corrupt key in TC, unable to xfrm'
    }
  }
  else
  {
    /* Keyinfo already suitable for hash */
    hashInput = src;
    inputLen = srcLen;
    keyPartLenPtr = 0;
  }

  /* Calculate primary key hash */
  md5_hash(dstHash, hashInput, inputLen);

  /* If the distribution key != primary key then we have to
   * form a distribution key from the primary key and calculate
   * a separate distribution hash based on this
   */
  if (compute_distkey)
  {
    jam();

    Uint32 distrKeyHash[4];
    /* Reshuffle primary key columns to get just distribution key */
    Uint32 len = create_distr_key(tableId, (Uint32*)hashInput, (Uint32*)alignedWorkspace, keyPartLenPtr);
    /* Calculate distribution key hash */
    md5_hash(distrKeyHash, alignedWorkspace, len);

    /* Just one word used for distribution */
    dstHash[1] = distrKeyHash[1];
  }
  return 0;
}
6081
/**
 * Compute the hash of a full primary key held in section 'ptrI',
 * storing the result in dst.hashInfo.
 *
 * @return 0 on success, or an error code propagated from
 *         handle_special_hash (e.g. 290 on xfrm failure).
 */
Uint32
Dbspj::computeHash(Signal* signal,
                   BuildKeyReq& dst, Uint32 tableId, Uint32 ptrI)
{
  /**
   * Essentially the same code as in Dbtc::hash().
   * The code for user defined partitioning has been removed though.
   */
  SegmentedSectionPtr ptr;
  getSection(ptr, ptrI);

  /* NOTE: md5_hash below require 64-bit alignment
   */
  const Uint32 MAX_KEY_SIZE_IN_LONG_WORDS=
    (MAX_KEY_SIZE_IN_WORDS + 1) / 2;
  Uint64 tmp64[MAX_KEY_SIZE_IN_LONG_WORDS];
  Uint32 *tmp32 = (Uint32*)tmp64;  // 32-bit view of the aligned buffer
  ndbassert(ptr.sz <= MAX_KEY_SIZE_IN_WORDS);
  copy(tmp32, ptr);  // linearize segmented section into tmp64/tmp32

  const KeyDescriptor* desc = g_key_descriptor_pool.getPtr(tableId);
  ndbrequire(desc != NULL);

  // Char attributes need xfrm; a distinct distribution key needs a
  // second hash — both handled by handle_special_hash().
  bool need_special_hash = desc->hasCharAttr | (desc->noOfDistrKeys > 0);
  if (need_special_hash)
  {
    jam();
    return handle_special_hash(tableId, dst.hashInfo, tmp64, ptr.sz, desc);
  }
  else
  {
    jam();
    md5_hash(dst.hashInfo, tmp64, ptr.sz);
    return 0;
  }
}
6118
6119 /**
6120 * This function differs from computeHash in that *ptrI*
6121 * only contains partition key (packed) and not full primary key
6122 */
6123 Uint32
computePartitionHash(Signal * signal,BuildKeyReq & dst,Uint32 tableId,Uint32 ptrI)6124 Dbspj::computePartitionHash(Signal* signal,
6125 BuildKeyReq& dst, Uint32 tableId, Uint32 ptrI)
6126 {
6127 SegmentedSectionPtr ptr;
6128 getSection(ptr, ptrI);
6129
6130 /* NOTE: md5_hash below require 64-bit alignment
6131 */
6132 const Uint32 MAX_KEY_SIZE_IN_LONG_WORDS=
6133 (MAX_KEY_SIZE_IN_WORDS + 1) / 2;
6134 Uint64 _space[MAX_KEY_SIZE_IN_LONG_WORDS];
6135 Uint64 *tmp64 = _space;
6136 Uint32 *tmp32 = (Uint32*)tmp64;
6137 Uint32 sz = ptr.sz;
6138 ndbassert(ptr.sz <= MAX_KEY_SIZE_IN_WORDS);
6139 copy(tmp32, ptr);
6140
6141 const KeyDescriptor* desc = g_key_descriptor_pool.getPtr(tableId);
6142 ndbrequire(desc != NULL);
6143
6144 bool need_xfrm = desc->hasCharAttr || desc->noOfVarKeys;
6145 if (need_xfrm)
6146 {
6147 jam();
6148 /**
6149 * xfrm distribution key
6150 */
6151 Uint32 srcPos = 0;
6152 Uint32 dstPos = 0;
6153 Uint32 * src = tmp32;
6154 Uint32 * dst = signal->theData+24;
6155 for (Uint32 i = 0; i < desc->noOfKeyAttr; i++)
6156 {
6157 const KeyDescriptor::KeyAttr& keyAttr = desc->keyAttr[i];
6158 if (AttributeDescriptor::getDKey(keyAttr.attributeDescriptor))
6159 {
6160 Uint32 attrLen =
6161 xfrm_attr_hash(keyAttr.attributeDescriptor, keyAttr.charsetInfo,
6162 src, srcPos, dst, dstPos,
6163 NDB_ARRAY_SIZE(signal->theData) - 24);
6164 if (unlikely(attrLen == 0))
6165 {
6166 DEBUG_CRASH();
6167 return 290; // 'Corrupt key in TC, unable to xfrm'
6168 }
6169 }
6170 }
6171 tmp64 = (Uint64*)dst;
6172 sz = dstPos;
6173 }
6174
6175 md5_hash(dst.hashInfo, tmp64, sz);
6176 return 0;
6177 }
6178
6179 /**
6180 * This method comes in with a list of nodes.
6181 * We have already verified that our own node
6182 * isn't in this list. If we have a node in this
6183 * list that is in the same location domain as
6184 * this node, it will be selected before any
6185 * other node. So we will always try to keep
6186 * the read coming from the same location domain.
6187 *
6188 * To avoid radical imbalances we provide a bit
6189 * of round robin on a node bases. It isn't
6190 * any perfect round robin. We simply rotate a
6191 * bit among the selected nodes instead of
6192 * always selecting the first one we find.
6193 */
6194 Uint32
check_own_location_domain(const Uint32 * nodes,Uint32 end)6195 Dbspj::check_own_location_domain(const Uint32 *nodes,
6196 Uint32 end)
6197 {
6198 Uint32 loc_nodes[MAX_NDB_NODES];
6199 Uint32 loc_node_count = 0;
6200 Uint32 my_location_domain_id =
6201 m_location_domain_id[getOwnNodeId()];
6202
6203 if (my_location_domain_id == 0)
6204 {
6205 jam();
6206 return 0;
6207 }
6208 for (Uint32 i = 0; i < end; i++)
6209 {
6210 jam();
6211 Uint32 node = nodes[i];
6212 ndbrequire(node != 0 && node < MAX_NDB_NODES);
6213 if (my_location_domain_id ==
6214 m_location_domain_id[node])
6215 {
6216 jam();
6217 loc_nodes[loc_node_count++] = node;
6218 }
6219 }
6220 if (loc_node_count != 0)
6221 {
6222 jam();
6223 /**
6224 * If many nodes in the same location domain we will
6225 * spread the load on them by using a very simple load
6226 * balancing routine.
6227 */
6228 m_load_balancer_location++;
6229 Uint32 ret_node = loc_nodes[m_load_balancer_location % loc_node_count];
6230 return ret_node;
6231 }
6232 return 0;
6233 }
6234
/**
 * Resolve the node/fragment to read from for the hash in dst.hashInfo:
 * asks DIH (DIGETNODESREQ) for the fragment's node list, then prefers
 * our own node (READ_BACKUP) or a node in our location domain.
 *
 * On success fills dst.fragId, dst.fragDistKey and dst.receiverRef.
 * @return 0 on success, else the DIH error code.
 */
Uint32
Dbspj::getNodes(Signal* signal, BuildKeyReq& dst, Uint32 tableId)
{
  TableRecordPtr tablePtr;
  tablePtr.i = tableId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);

  DiGetNodesReq * req = (DiGetNodesReq *)&signal->theData[0];
  req->tableId = tableId;
  req->hashValue = dst.hashInfo[1];
  req->distr_key_indicator = 0;  // userDefinedPartitioning not supported!
  req->scan_indicator = 0;
  // Fully replicated tables may be read from any node holding a replica
  req->anyNode = (tablePtr.p->m_flags & TableRecord::TR_FULLY_REPLICATED) != 0;
  req->get_next_fragid_indicator = 0;
  req->jamBufferPtr = jamBuffer();

  EXECUTE_DIRECT_MT(DBDIH, GSN_DIGETNODESREQ, signal,
                    DiGetNodesReq::SignalLength, 0);
  jamEntry();

  // The CONF/REF reuses signal->theData: theData[0] != 0 signals an
  // error whose code is in theData[1]
  DiGetNodesConf * conf = (DiGetNodesConf *)&signal->theData[0];
  const Uint32 err = signal->theData[0] ? signal->theData[1] : 0;
  Uint32 Tdata2 = conf->reqinfo;
  Uint32 nodeId = conf->nodes[0];
  Uint32 instanceKey = (Tdata2 >> 24) & 127;

  DEBUG("HASH to nodeId:" << nodeId << ", instanceKey:" << instanceKey);

  jamEntry();
  if (unlikely(err != 0))
  {
    jam();
    goto error;
  }

  /**
   * SPJ only does committed-read (for now)
   * so it's always ok to READ_BACKUP
   * if applicable
   *
   */
  if (nodeId != getOwnNodeId() &&
      tablePtr.p->m_flags & TableRecord::TR_READ_BACKUP)
  {
    /* Node cnt from DIH ignores primary, presumably to fit in 2 bits */
    Uint32 cnt = (Tdata2 & 3) + 1;
    // Prefer reading locally if we hold a replica
    for (Uint32 i = 1; i < cnt; i++)
    {
      jam();
      if (conf->nodes[i] == getOwnNodeId())
      {
        jam();
        nodeId = getOwnNodeId();
        break;
      }
    }
    // Otherwise prefer a replica within our location domain
    if (nodeId != getOwnNodeId())
    {
      Uint32 node;
      jam();
      if ((node = check_own_location_domain(&conf->nodes[0],
                                            cnt)) != 0)
      {
        nodeId = node;
      }
    }
  }

  dst.fragId = conf->fragId;
  dst.fragDistKey = (Tdata2 >> 16) & 255;
  dst.receiverRef = numberToRef(DBLQH, instanceKey, nodeId);

  return 0;

error:
  return err;
}
6312
/**
 * Consistency check of a LOOKUP TreeNode (used from debug/dump paths).
 * Currently a stub that always reports the node as consistent.
 */
bool
Dbspj::lookup_checkNode(const Ptr<Request> requestPtr,
                        const Ptr<TreeNode> treeNodePtr)
{
  jam();

  /* TODO */

  return true;
}
6323
/**
 * Dump diagnostic state of a LOOKUP TreeNode to the event logger.
 */
void
Dbspj::lookup_dumpNode(const Ptr<Request> requestPtr,
                       const Ptr<TreeNode> treeNodePtr)
{
  jam();

  const LookupData& data = treeNodePtr.p->m_lookup_data;

  g_eventLogger->info("DBSPJ %u :       LOOKUP api_resultRef 0x%x "
                      "resultData %u outstanding %u",
                      instance(),
                      data.m_api_resultRef,
                      data.m_api_resultData,
                      data.m_outstanding);

  /* TODO : Dump LQHKEYREQ */
}
6341
6342 /**
6343 * END - MODULE LOOKUP
6344 */
6345
6346 /**
6347 * MODULE SCAN FRAGMENT
6348 *
6349 * NOTE: This may not be root-node
6350 */
/**
 * Dispatch table for SCAN FRAGMENT TreeNodes. Entries are positional;
 * each slot is labelled with its role below.
 */
const Dbspj::OpInfo
Dbspj::g_ScanFragOpInfo =
{
  &Dbspj::scanFrag_build,                  // build
  &Dbspj::scanFrag_prepare,                // prepare
  &Dbspj::scanFrag_start,                  // start
  &Dbspj::scanFrag_countSignal,            // countSignal
  0,  // execLQHKEYREF  (not applicable to scans)
  0,  // execLQHKEYCONF (not applicable to scans)
  &Dbspj::scanFrag_execSCAN_FRAGREF,       // execSCAN_FRAGREF
  &Dbspj::scanFrag_execSCAN_FRAGCONF,      // execSCAN_FRAGCONF
  &Dbspj::scanFrag_parent_row,             // parent_row
  &Dbspj::scanFrag_parent_batch_complete,  // parent_batch_complete
  &Dbspj::scanFrag_parent_batch_repeat,    // parent_batch_repeat
  &Dbspj::scanFrag_parent_batch_cleanup,   // parent_batch_cleanup
  &Dbspj::scanFrag_execSCAN_NEXTREQ,       // execSCAN_NEXTREQ
  &Dbspj::scanFrag_complete,               // complete
  &Dbspj::scanFrag_abort,                  // abort
  &Dbspj::scanFrag_execNODE_FAILREP,       // execNODE_FAILREP
  &Dbspj::scanFrag_cleanup,                // cleanup
  &Dbspj::scanFrag_checkNode,              // checkNode
  &Dbspj::scanFrag_dumpNode                // dumpNode
};
6374
/**
 * Build a SCAN FRAGMENT TreeNode from the serialized query tree node
 * and its parameters. For the root node most of the ScanFragReq is
 * copied from the incoming 'start_signal'; for non-root nodes the
 * request is constructed from scratch and completed during 'prepare'.
 *
 * @return 0 on success, else a DbspjErr error code.
 */
Uint32
Dbspj::scanFrag_build(Build_context& ctx,
                      Ptr<Request> requestPtr,
                      const QueryNode* qn,
                      const QueryNodeParameters* qp)
{
  Uint32 err = 0;
  Ptr<TreeNode> treeNodePtr;
  const QN_ScanFragNode * node = (const QN_ScanFragNode*)qn;
  const QN_ScanFragParameters * param = (const QN_ScanFragParameters*)qp;

  // Only scan requests can have scan-TreeNodes
  ndbassert(requestPtr.p->isScan());

  do
  {
    jam();
    err = DbspjErr::InvalidTreeNodeSpecification;
    DEBUG("scanFrag_build: len=" << node->len);
    if (unlikely(node->len < QN_ScanFragNode::NodeSize))
    {
      jam();
      break;
    }

    err = DbspjErr::InvalidTreeParametersSpecification;
    DEBUG("param len: " << param->len);
    if (unlikely(param->len < QN_ScanFragParameters::NodeSize))
    {
      jam();
      break;
    }

    err = createNode(ctx, requestPtr, treeNodePtr);
    if (unlikely(err != 0))
    {
      jam();
      break;
    }

    const Uint32 treeBits = node->requestInfo;
    const Uint32 paramBits = param->requestInfo;
    const Uint32 batchRows = param->batch_size_rows;
    const Uint32 batchBytes = param->batch_size_bytes;
    const Uint32 indexId = node->tableId;
    // For an index scan, resolve the underlying primary table
    const Uint32 tableId = g_key_descriptor_pool.getPtr(indexId)->primaryTableId;

    treeNodePtr.p->m_info = &g_ScanFragOpInfo;
    treeNodePtr.p->m_tableOrIndexId = indexId;
    treeNodePtr.p->m_primaryTableId = tableId;
    treeNodePtr.p->m_schemaVersion = node->tableVersion;
    treeNodePtr.p->m_bits |= TreeNode::T_ATTR_INTERPRETED;
    treeNodePtr.p->m_batch_size = batchRows;

    ctx.m_resultData = param->resultData;

    /**
     * Parse stuff
     */
    struct DABuffer nodeDA, paramDA;
    nodeDA.ptr = node->optional;
    nodeDA.end = nodeDA.ptr + (node->len - QN_ScanFragNode::NodeSize);
    paramDA.ptr = param->optional;
    paramDA.end = paramDA.ptr + (param->len - QN_ScanFragParameters::NodeSize);

    err = parseScanFrag(ctx, requestPtr, treeNodePtr,
                        nodeDA, treeBits, paramDA, paramBits);

    if (unlikely(err != 0))
    {
      jam();
      break;
    }

    /**
     * If there exists other scan TreeNodes not being among
     * my ancestors, results from this scanFrag may be repeated
     * as part of an X-scan.
     *
     * NOTE: The scan nodes being along the left deep ancestor chain
     *       are not 'repeatable' as they are driving the
     *       repeated X-scan and are thus not repeated themself.
     */
    if (requestPtr.p->m_bits & Request::RT_REPEAT_SCAN_RESULT &&
        !treeNodePtr.p->m_ancestors.contains(ctx.m_scans))
    {
      treeNodePtr.p->m_bits |= TreeNode::T_SCAN_REPEATABLE;
    }

    ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
    ScanFragReq* const dst = reinterpret_cast<ScanFragReq*>(data.m_scanFragReq);

    /**
     * The root node get most of its ScanFragReq contents readily
     * filled in from the 'start_signal'. So building the initial
     * contents of the m_scanFragReq has to be handled different
     * for the root node vs. a non-root node.
     */
    if (ctx.m_start_signal)  //Is the root node?
    {
      jam();
      ndbassert(treeNodePtr.p->m_parentPtrI == RNIL);

      /**
       * The REQuest in 'start_signal' contains most of the m_scanFragReq
       * readilly filled in. Copy it, and modify where needed.
       */
      const Signal* signal = ctx.m_start_signal;
      const ScanFragReq* const req = reinterpret_cast<const ScanFragReq*>(signal->getDataPtr());
      memcpy(dst, req, sizeof(data.m_scanFragReq));

      // Assert some limitations on the SPJ supported ScanFragReq
      ndbassert(ScanFragReq::getLockMode(req->requestInfo) == 0);
      ndbassert(ScanFragReq::getHoldLockFlag(req->requestInfo) == 0);
      ndbassert(ScanFragReq::getKeyinfoFlag(req->requestInfo) == 0);
      ndbassert(ScanFragReq::getReadCommittedFlag(req->requestInfo) == 1);
      ndbassert(ScanFragReq::getLcpScanFlag(req->requestInfo) == 0);
      ndbassert(ScanFragReq::getReorgFlag(req->requestInfo) == ScanFragReq::REORG_ALL);

      /**
       * 'NoDiskFlag' should agree with information in treeNode
       */
      ndbassert(ScanFragReq::getNoDiskFlag(req->requestInfo) ==
                ((treeBits & DABits::NI_LINKED_DISK) == 0 &&
                 (paramBits & DABits::PI_DISK_ATTR) == 0));

      ndbassert(dst->savePointId == ctx.m_savepointId);
      ndbassert(dst->tableId == node->tableId);
      ndbassert(dst->schemaVersion == node->tableVersion);
      ndbassert(dst->transId1 == requestPtr.p->m_transId[0]);
      ndbassert(dst->transId2 == requestPtr.p->m_transId[1]);

      treeNodePtr.p->m_bits |= TreeNode::T_ONE_SHOT;

      TableRecordPtr tablePtr;
      tablePtr.i = treeNodePtr.p->m_tableOrIndexId;
      ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
      const bool readBackup =
        (tablePtr.p->m_flags & TableRecord::TR_READ_BACKUP) != 0;

      data.m_fragCount = 0;

      /**
       * As this is the root node, fragId is already contained in the REQuest
       * Fill in the set of 'm_fragments' to be SCAN'ed by this REQ.
       */
      {
        Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);

        if (ScanFragReq::getMultiFragFlag(req->requestInfo))
        {
          jam();
          // Fragment count + ids follow the fixed part of the signal
          Uint32 variableLen = 25;
          data.m_fragCount = signal->theData[variableLen++];
          for (Uint32 i=0; i < data.m_fragCount; i++)
          {
            jam();
            Ptr<ScanFragHandle> fragPtr;
            const Uint32 fragId = signal->theData[variableLen++];
            const Uint32 ref = numberToRef(DBLQH,
                                           getInstanceKey(req->tableId, fragId),
                                           getOwnNodeId());

            DEBUG("Scan build, fragId: " << fragId << ", ref: " << ref);

            if (!ERROR_INSERTED_CLEAR(17004) &&
                likely(m_scanfraghandle_pool.seize(requestPtr.p->m_arena, fragPtr)))
            {
              fragPtr.p->init(fragId, readBackup);
              fragPtr.p->m_treeNodePtrI = treeNodePtr.i;
              fragPtr.p->m_ref = ref;
              list.addLast(fragPtr);
            }
            else
            {
              jam();
              err = DbspjErr::OutOfQueryMemory;
              return err;
            }
          }
        }
        else  // 'not getMultiFragFlag(req->requestInfo)'
        {
          jam();
          Ptr<ScanFragHandle> fragPtr;
          data.m_fragCount = 1;

          const Uint32 ref =
            numberToRef(DBLQH,
                        getInstanceKey(req->tableId, req->fragmentNoKeyLen),
                        getOwnNodeId());

          if (!ERROR_INSERTED_CLEAR(17004) &&
              likely(m_scanfraghandle_pool.seize(requestPtr.p->m_arena, fragPtr)))
          {
            jam();
            fragPtr.p->init(req->fragmentNoKeyLen, readBackup);
            fragPtr.p->m_treeNodePtrI = treeNodePtr.i;
            fragPtr.p->m_ref = ref;
            list.addLast(fragPtr);
          }
          else
          {
            jam();
            err = DbspjErr::OutOfQueryMemory;
            return err;
          }
        }
        requestPtr.p->m_rootFragCnt = data.m_fragCount;
      }

      if (ScanFragReq::getRangeScanFlag(req->requestInfo))
      {
        c_Counters.incr_counter(CI_RANGE_SCANS_RECEIVED, 1);
      }
      else
      {
        c_Counters.incr_counter(CI_TABLE_SCANS_RECEIVED, 1);
      }
    }
    else
    {
      // Non-root scan: needs prepare/complete phases to resolve fragments
      requestPtr.p->m_bits |= Request::RT_NEED_PREPARE;
      requestPtr.p->m_bits |= Request::RT_NEED_COMPLETE;

      treeNodePtr.p->m_bits |= TreeNode::T_NEED_PREPARE;
      treeNodePtr.p->m_bits |= TreeNode::T_NEED_COMPLETE;
      treeNodePtr.p->m_bits |= TreeNode::T_NEED_REPORT_BATCH_COMPLETED;

      dst->tableId = node->tableId;
      dst->schemaVersion = node->tableVersion;
      dst->fragmentNoKeyLen = 0xff;  //Filled in after 'prepare'
      dst->savePointId = ctx.m_savepointId;
      dst->transId1 = requestPtr.p->m_transId[0];
      dst->transId2 = requestPtr.p->m_transId[1];

      Uint32 requestInfo = 0;
      ScanFragReq::setReadCommittedFlag(requestInfo, 1);
      ScanFragReq::setScanPrio(requestInfo, ctx.m_scanPrio);
      ScanFragReq::setRangeScanFlag(requestInfo, 1);
      ScanFragReq::setNoDiskFlag(requestInfo,
                                 (treeBits & DABits::NI_LINKED_DISK) == 0 &&
                                 (paramBits & DABits::PI_DISK_ATTR) == 0);

      if (treeBits & DABits::NI_FIRST_MATCH && treeNodePtr.p->isLeaf())
      {
        // Can only push firstMatch elimination to data nodes if results does
        // not depends of finding matches from children -> has to be a leaf
        ScanFragReq::setFirstMatchFlag(requestInfo, 1);
      }
      if (treeBits & DABits::NI_ANTI_JOIN && treeNodePtr.p->isLeaf())
      {
        // ANTI_JOIN's cares about whether a match was found or not
        // Thus, returning only the first match is sufficient here as well
        ScanFragReq::setFirstMatchFlag(requestInfo, 1);
      }
      dst->requestInfo = requestInfo;
    }

    // Common part whether root or not
    dst->senderData = treeNodePtr.i;
    dst->resultRef = reference();
    dst->resultData = treeNodePtr.i;
    ScanFragReq::setCorrFactorFlag(dst->requestInfo, 1);
    ScanFragReq::setMultiFragFlag(dst->requestInfo, 0);

    dst->batch_size_rows = batchRows;
    dst->batch_size_bytes = batchBytes;

    ctx.m_scan_cnt++;
    ctx.m_scans.set(treeNodePtr.p->m_node_no);

    return 0;
  } while (0);

  return err;
}
6652
/**
 * Parse the serialized optional sections of a SCAN FRAGMENT node:
 * the common DA part plus any prune pattern (linked or constant),
 * and derive the parallelism / sort-order TreeNode bits.
 *
 * @return 0 on success, else a DbspjErr error code.
 */
Uint32
Dbspj::parseScanFrag(Build_context& ctx,
                     Ptr<Request> requestPtr,
                     Ptr<TreeNode> treeNodePtr,
                     DABuffer tree, Uint32 treeBits,
                     DABuffer param, Uint32 paramBits)
{
  Uint32 err = 0;

  typedef QN_ScanFragNode Node;
  typedef QN_ScanFragParameters Params;

  do
  {
    jam();

    ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
    data.m_fragments.init();
    data.m_fragCount = 0;
    data.m_frags_outstanding = 0;
    data.m_frags_complete = 0;
    data.m_frags_not_started = 0;
    data.m_parallelismStat.init();
    data.m_batch_chunks = 0;

    /**
     * We will need to look at the parameters again if the scan is pruned and the prune
     * key uses parameter values. Therefore, we keep a reference to the start of the
     * parameter buffer.
     */
    DABuffer origParam = param;
    err = parseDA(ctx, requestPtr, treeNodePtr,
                  tree, treeBits, param, paramBits);
    if (unlikely(err != 0))
      break;

    if (treeBits & Node::SF_PRUNE_PATTERN)
    {
      // First word packs pattern length (low 16) and parameter count (high 16)
      Uint32 len_cnt = * tree.ptr ++;
      Uint32 len = len_cnt & 0xFFFF;  // length of pattern in words
      Uint32 cnt = len_cnt >> 16;     // no of parameters

      LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
      // Parameter count must agree with the PRUNE_PARAMS flags
      ndbrequire((cnt==0) == ((treeBits & Node::SF_PRUNE_PARAMS) ==0));
      ndbrequire((cnt==0) == ((paramBits & Params::SFP_PRUNE_PARAMS)==0));

      if (treeBits & Node::SF_PRUNE_LINKED)
      {
        jam();
        DEBUG("LINKED-PRUNE PATTERN w/ " << cnt << " PARAM values");

        data.m_prunePattern.init();
        Local_pattern_store pattern(pool, data.m_prunePattern);

        /**
         * Expand pattern into a new pattern (with linked values)
         */
        err = expand(pattern, treeNodePtr, tree, len, origParam, cnt);
        if (unlikely(err != 0))
        {
          jam();
          break;
        }
        treeNodePtr.p->m_bits |= TreeNode::T_PRUNE_PATTERN;
        c_Counters.incr_counter(CI_PRUNED_RANGE_SCANS_RECEIVED, 1);
      }
      else
      {
        jam();
        DEBUG("FIXED-PRUNE w/ " << cnt << " PARAM values");

        /**
         * Expand pattern directly into
         *   This means a "fixed" pruning from here on
         *   i.e guaranteed single partition
         */
        Uint32 prunePtrI = RNIL;
        bool hasNull;
        err = expand(prunePtrI, tree, len, origParam, cnt, hasNull);
        if (unlikely(err != 0))
        {
          jam();
          releaseSection(prunePtrI);
          break;
        }

        if (unlikely(hasNull))
        {
          /* API should have eliminated requests w/ const-NULL keys */
          jam();
          DEBUG("BEWARE: T_CONST_PRUNE-key contain NULL values");
          releaseSection(prunePtrI);
          // treeNodePtr.p->m_bits |= TreeNode::T_NULL_PRUNE;
          // break;
          ndbabort();
        }
        ndbrequire(prunePtrI != RNIL);  /* todo: can we allow / take advantage of NULLs in range scan? */
        data.m_constPrunePtrI = prunePtrI;

        /**
         * We may not compute the partition for the hash-key here
         * as we have not yet opened a read-view
         */
        treeNodePtr.p->m_bits |= TreeNode::T_CONST_PRUNE;
        c_Counters.incr_counter(CI_CONST_PRUNED_RANGE_SCANS_RECEIVED, 1);
      }
    } //SF_PRUNE_PATTERN

    // Const-pruned scans hit a single partition, so parallelism is moot
    if ((treeNodePtr.p->m_bits & TreeNode::T_CONST_PRUNE) == 0 &&
        ((treeBits & Node::SF_PARALLEL) ||
         (paramBits & Params::SFP_PARALLEL)))
    {
      jam();
      treeNodePtr.p->m_bits |= TreeNode::T_SCAN_PARALLEL;
    }

    if (paramBits & Params::SFP_SORTED_ORDER)
    {
      jam();
      treeNodePtr.p->m_bits |= TreeNode::T_SORTED_ORDER;
    }

    return 0;
  } while(0);

  jam();
  return err;
}
6781
/**
 * Prepare phase of a SCAN FRAGMENT TreeNode: ask DIH for the scan
 * cookie/fragment count (DIH_SCAN_TAB_REQ, executed directly), then
 * deliver the outcome to ourselves as a buffered CONF/REF signal to
 * force a real-time break (rationale in the comment below).
 */
void
Dbspj::scanFrag_prepare(Signal * signal,
                        Ptr<Request> requestPtr, Ptr<TreeNode> treeNodePtr)
{
  jam();

  if (!ERROR_INSERTED(17521))  // Avoid emulated rnd errors
  {
    // ::checkTableError() should be handled before we reach this far
    ndbassert(checkTableError(treeNodePtr) == 0);  //Handled in Dbspj::start
  }
  ndbassert(treeNodePtr.p->m_state == TreeNode::TN_BUILDING);
  treeNodePtr.p->m_state = TreeNode::TN_PREPARING;

  requestPtr.p->m_outstanding++;

  DihScanTabReq * req = (DihScanTabReq*)signal->getDataPtrSend();
  req->tableId = treeNodePtr.p->m_tableOrIndexId;
  req->schemaTransId = 0;
  req->jamBufferPtr = jamBuffer();

  EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_REQ, signal,
                    DihScanTabReq::SignalLength, 0);

  // The direct-executed reply overwrites signal->theData:
  // senderData == 0 indicates success, != 0 indicates an error
  DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();
  Uint32 senderData = conf->senderData;
  conf->senderData = treeNodePtr.i;
  /**
   * We need to introduce real-time break here for 2 reasons. The first
   * is that it is required by real-time break rules. We can start an
   * arbitrary number of prepare scans here. So it is necessary to do a
   * real-time break here to ensure that we don't execute for too long
   * without real-time breaks.
   *
   * The second reason is that the caller is looping over the list
   * of tree nodes and so we can't change this list while he is
   * looping over it. So we introduce a real-time break to ensure that
   * the caller only starts up prepare messages and don't actually
   * perform all of them.
   */
  if (senderData == 0)
  {
    sendSignal(reference(),
               GSN_DIH_SCAN_TAB_CONF,
               signal,
               DihScanTabConf::SignalLength,
               JBB);
    return;
  }
  else
  {
    sendSignal(reference(),
               GSN_DIH_SCAN_TAB_REF,
               signal,
               DihScanTabRef::SignalLength,
               JBB);
    return;
  }
}
6841
/**
 * DIH_SCAN_TAB_REF: the DIH scan-table request (sent to ourselves from
 * scanFrag_prepare) failed. Account for the completed prepare step and
 * abort the whole request with the DIH error code.
 */
void
Dbspj::execDIH_SCAN_TAB_REF(Signal* signal)
{
  jamEntry();
  DihScanTabRef * ref = (DihScanTabRef*)signal->getDataPtr();

  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, ref->senderData);
  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);

  ndbrequire(requestPtr.p->isScan());
  ndbrequire(requestPtr.p->m_outstanding >= 1);
  requestPtr.p->m_outstanding -= 1;
  Uint32 errCode = ref->error;
  abort(signal, requestPtr, errCode);
}
6859
/**
 * DIH_SCAN_TAB_CONF: DIH delivered the scan cookie and fragment count
 * for a SCAN FRAGMENT TreeNode (reply to scanFrag_prepare's request,
 * routed via ourselves for a real-time break).
 *
 * Allocates one ScanFragHandle per fragment, resolves the receiver
 * reference(s) — directly for const-pruned / single-fragment scans,
 * or via DIGETNODESREQ batches otherwise — and completes or fails the
 * prepare phase of the request.
 */
void
Dbspj::execDIH_SCAN_TAB_CONF(Signal* signal)
{
  jamEntry();
  DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();

  Ptr<TreeNode> treeNodePtr;
  m_treenode_pool.getPtr(treeNodePtr, conf->senderData);

  ndbrequire(treeNodePtr.p->m_info == &g_ScanFragOpInfo);

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;

  Uint32 cookie = conf->scanCookie;
  Uint32 fragCount = conf->fragmentCount;

  if (conf->reorgFlag)
  {
    jam();
    // Table is being reorganized: restrict scan to not-moved rows
    ScanFragReq * dst = (ScanFragReq*)data.m_scanFragReq;
    ScanFragReq::setReorgFlag(dst->requestInfo, ScanFragReq::REORG_NOT_MOVED);
  }
  if (treeNodePtr.p->m_bits & TreeNode::T_CONST_PRUNE)
  {
    jam();
    // Const-pruned scan hits exactly one partition
    fragCount = 1;
  }
  data.m_fragCount = fragCount;
  data.m_scanCookie = cookie;

  const Uint32 prunemask = TreeNode::T_PRUNE_PATTERN | TreeNode::T_CONST_PRUNE;
  bool pruned = (treeNodePtr.p->m_bits & prunemask) != 0;

  TableRecordPtr tablePtr;
  tablePtr.i = treeNodePtr.p->m_tableOrIndexId;
  ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
  const bool readBackup =
    (tablePtr.p->m_flags & TableRecord::TR_READ_BACKUP) != 0;

  Ptr<Request> requestPtr;
  m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
  ndbassert(requestPtr.p->m_outstanding > 0);
  requestPtr.p->m_outstanding--;

  // Add a skew in the fragment lists such that we don't scan
  // the same subset of frags from all SPJ requests in case of
  // the scan not being 'T_SCAN_PARALLEL'
  Uint16 fragNoOffs = (getOwnNodeId()*requestPtr.p->m_rootFragCnt) % fragCount;
  Uint32 err = 0;

  do
  {
    Ptr<ScanFragHandle> fragPtr;

    /** Allocate & init all 'fragCnt' fragment descriptors */
    {
      Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);

      err = checkTableError(treeNodePtr);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }
      for (Uint32 i = 0; i<fragCount; i++)
      {
        Ptr<ScanFragHandle> fragPtr;
        Uint16 fragNo = (fragNoOffs+i) % fragCount;

        if (!ERROR_INSERTED_CLEAR(17012) &&
            likely(m_scanfraghandle_pool.seize(requestPtr.p->m_arena, fragPtr)))
        {
          jam();
          fragPtr.p->init(fragNo, readBackup);
          fragPtr.p->m_treeNodePtrI = treeNodePtr.i;
          list.addLast(fragPtr);
        }
        else
        {
          jam();
          err = DbspjErr::OutOfQueryMemory;
          goto error;
        }
      }
      list.first(fragPtr);  // Needed if T_CONST_PRUNE
    }  // end 'Alloc scope'

    if (treeNodePtr.p->m_bits & TreeNode::T_CONST_PRUNE)
    {
      jam();

      // TODO we need a different variant of computeHash here,
      // since m_constPrunePtrI does not contain full primary key
      // but only parts in distribution key

      BuildKeyReq tmp;
      Uint32 tableId = treeNodePtr.p->m_primaryTableId;
      err = computePartitionHash(signal, tmp, tableId, data.m_constPrunePtrI);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }

      releaseSection(data.m_constPrunePtrI);
      data.m_constPrunePtrI = RNIL;

      err = getNodes(signal, tmp, tableId);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }

      fragPtr.p->m_fragId = tmp.fragId;
      fragPtr.p->m_ref = tmp.receiverRef;
      ndbassert(data.m_fragCount == 1);
    }
    else if (fragCount == 1)
    {
      jam();
      /**
       * This is roughly equivalent to T_CONST_PRUNE
       *   pretend that it is const-pruned
       */
      if (treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN)
      {
        jam();
        LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
        Local_pattern_store pattern(pool, data.m_prunePattern);
        pattern.release();
      }
      data.m_constPrunePtrI = RNIL;
      Uint32 clear = TreeNode::T_PRUNE_PATTERN | TreeNode::T_SCAN_PARALLEL;
      treeNodePtr.p->m_bits &= ~clear;
      treeNodePtr.p->m_bits |= TreeNode::T_CONST_PRUNE;

      /**
       * We must get fragPtr.p->m_ref...so set pruned=false
       */
      pruned = false;
    }
    data.m_frags_complete = data.m_fragCount;

    if (!pruned)
    {
      /** Start requesting node info from DIH */
      jam();
      ndbassert(data.m_frags_outstanding == 0);
      data.m_frags_outstanding = data.m_fragCount;
      requestPtr.p->m_outstanding++;

      err = scanFrag_sendDihGetNodesReq(signal, requestPtr, treeNodePtr);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }
    }
    else
    {
      jam();
      treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
    }

    ndbassert(err == 0);
    checkPrepareComplete(signal, requestPtr);
    return;
  } while (0);

error:
  jam();
  ndbassert(err != 0);
  abort(signal, requestPtr, err);
  checkBatchComplete(signal, requestPtr);
}
7036
7037 /**
7038 * Will check the fragment list for fragments which need to
7039 * get node info to construct 'fragPtr.p->m_ref' from DIH.
7040 *
7041 * In order to avoid CPU starvation, or unmanagable huge FragItem[],
7042 * max MAX_DIH_FRAG_REQS are requested in a single signal.
7043 * If there are more fragments, we have to repeatable call this
7044 * function when CONF for the first fragment set is received.
7045 */
7046 Uint32
scanFrag_sendDihGetNodesReq(Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)7047 Dbspj::scanFrag_sendDihGetNodesReq(Signal* signal,
7048 Ptr<Request> requestPtr,
7049 Ptr<TreeNode> treeNodePtr)
7050 {
7051 jam();
7052 Uint32 err = 0;
7053 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
7054 Uint32 tableId = treeNodePtr.p->m_tableOrIndexId;
7055 TableRecordPtr tablePtr;
7056 Ptr<ScanFragHandle> fragPtr;
7057 Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
7058 tablePtr.i = tableId;
7059 ptrCheckGuard(tablePtr, c_tabrecFilesize, m_tableRecord);
7060 Uint32 readAny = tablePtr.p->m_flags & TableRecord::TR_FULLY_REPLICATED ?
7061 1 : 0;
7062
7063 ndbassert(data.m_frags_outstanding > 0);
7064
7065 Uint32 fragCnt = 0;
7066 for (list.first(fragPtr);
7067 !fragPtr.isNull();
7068 list.next(fragPtr))
7069 {
7070 jam();
7071 ndbassert(requestPtr.p->m_outstanding > 0);
7072 ndbassert(data.m_frags_outstanding > 0);
7073
7074 if (fragCnt >= DiGetNodesReq::MAX_DIGETNODESREQS ||
7075 (ERROR_INSERTED(17131) && fragCnt >= 1))
7076 {
7077 jam();
7078 signal->theData[0] = 3;
7079 signal->theData[1] = treeNodePtr.i;
7080 sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
7081 break; //to exit
7082 }
7083
7084 if (fragPtr.p->m_ref == 0) // Need GSN_DIGETNODRESREQ
7085 {
7086 jam();
7087 DiGetNodesReq * const req = (DiGetNodesReq *)&signal->theData[0];
7088
7089 req->tableId = treeNodePtr.p->m_tableOrIndexId;
7090 req->hashValue = fragPtr.p->m_fragId;
7091 req->distr_key_indicator = ZTRUE;
7092 req->scan_indicator = ZTRUE;
7093 req->anyNode = readAny;
7094 req->get_next_fragid_indicator = 0;
7095 req->jamBufferPtr = jamBuffer();
7096
7097 EXECUTE_DIRECT_MT(DBDIH, GSN_DIGETNODESREQ, signal,
7098 DiGetNodesReq::SignalLength, 0);
7099
7100 const Uint32 errCode = signal->theData[0];
7101
7102 if (ERROR_INSERTED_CLEAR(17130) && requestPtr.p->m_outstanding == 1)
7103 {
7104 jamEntry();
7105 data.m_frags_outstanding = 0;
7106 err= DbspjErr::OutOfSectionMemory;
7107 break;
7108 }
7109 else if (unlikely(errCode))
7110 {
7111 jamEntry();
7112 data.m_frags_outstanding = 0;
7113 err= errCode;
7114 break;
7115 }
7116
7117 const DiGetNodesConf * conf = (DiGetNodesConf *)&signal->theData[0];
7118 //if (!errCode)
7119 {
7120 /**
7121 * Get instance key from upper bits except most significant bit which
7122 * is used reorg moving flag.
7123 */
7124 jamEntry();
7125 /* Node cnt from DIH ignores primary, presumably to fit in 2 bits */
7126 Uint32 cnt = (conf->reqinfo & 3) + 1;
7127 Uint32 instanceKey = (conf->reqinfo >> 24) & 127;
7128 NodeId nodeId = conf->nodes[0];
7129 if (nodeId != getOwnNodeId() &&
7130 fragPtr.p->m_readBackup)
7131 {
7132 for (Uint32 i = 1; i < cnt; i++)
7133 {
7134 jam();
7135 if (conf->nodes[i] == getOwnNodeId())
7136 {
7137 jam();
7138 nodeId = getOwnNodeId();
7139 break;
7140 }
7141 }
7142 if (nodeId != getOwnNodeId())
7143 {
7144 Uint32 node;
7145 jam();
7146 if ((node = check_own_location_domain(&conf->nodes[0],
7147 cnt)) != 0)
7148 {
7149 nodeId = node;
7150 }
7151 }
7152 }
7153 fragPtr.p->m_ref = numberToRef(DBLQH, instanceKey, nodeId);
7154 /**
7155 * For Fully replicated tables we can change the fragment id to a local
7156 * fragment as part of DIGETNODESREQ. So set it again here.
7157 */
7158 fragPtr.p->m_fragId = conf->fragId;
7159 }
7160
7161 fragCnt++;
7162 ndbassert(data.m_frags_outstanding > 0);
7163 ndbassert(treeNodePtr.p->m_state != TreeNode::TN_INACTIVE);
7164 data.m_frags_outstanding--;
7165 }
7166 }
7167 jam();
7168
7169 if (data.m_frags_outstanding == 0)
7170 {
7171 jam();
7172 treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
7173 requestPtr.p->m_outstanding--;
7174 }
7175 return err;
7176 } //Dbspj::scanFrag_sendDihGetNodesReq
7177
7178
7179 void
scanFrag_start(Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)7180 Dbspj::scanFrag_start(Signal* signal,
7181 Ptr<Request> requestPtr,
7182 Ptr<TreeNode> treeNodePtr)
7183 {
7184 jam();
7185 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
7186
7187 ndbassert(data.m_fragCount > 0);
7188 ndbassert(data.m_frags_outstanding == 0);
7189 ndbassert(data.m_frags_complete == 0);
7190 data.m_frags_not_started = data.m_fragCount;
7191
7192 ndbassert(treeNodePtr.p->m_state == TreeNode::TN_BUILDING);
7193 treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
7194
7195 scanFrag_send(signal, requestPtr, treeNodePtr);
7196 }//Dbspj::scanFrag_start
7197
7198 Uint32
scanFrag_findFrag(Local_ScanFragHandle_list & list,Ptr<ScanFragHandle> & fragPtr,Uint32 fragId)7199 Dbspj::scanFrag_findFrag(Local_ScanFragHandle_list & list,
7200 Ptr<ScanFragHandle> & fragPtr, Uint32 fragId)
7201 {
7202 for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
7203 {
7204 jam();
7205 if (fragPtr.p->m_fragId == fragId)
7206 {
7207 jam();
7208 return 0;
7209 }
7210 }
7211
7212 return DbspjErr::IndexFragNotFound;
7213 }
7214
/**
 * A row was produced by this TreeNode's parent: build the range bound(s)
 * which that row generates for this child scan.
 *
 * If a prune pattern is present, the prune key is expanded and hashed to
 * find the single fragment to scan and the range is attached to that
 * fragment's handle; otherwise the range is stored on the first fragment
 * handle (shared by all fragments). For T_ONE_SHOT nodes a single parent
 * row means the batch is complete, so the scan is started right away.
 * On any error the whole Request is aborted.
 */
void
Dbspj::scanFrag_parent_row(Signal* signal,
                           Ptr<Request> requestPtr,
                           Ptr<TreeNode> treeNodePtr,
                           const RowPtr & rowRef)
{
  jam();
  ndbassert(treeNodePtr.p->m_parentPtrI != RNIL);
  DEBUG("::scanFrag_parent_row"
     << ", node: " << treeNodePtr.p->m_node_no);

  Uint32 err;
  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;

  /**
   * Construct range definition,
   * and if prune pattern enabled
   * stuff it onto correct scanFrag
   */
  do
  {
    Ptr<ScanFragHandle> fragPtr;
    Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
    LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);

    err = checkTableError(treeNodePtr);
    if (unlikely(err != 0))
    {
      jam();
      break;
    }

    if (treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN)
    {
      jam();

      /**
       * TODO: Expand into linear memory instead
       *       of expanding into sections, and then copy
       *       section into linear
       */
      Local_pattern_store pattern(pool, data.m_prunePattern);
      Uint32 pruneKeyPtrI = RNIL;
      bool hasNull;
      err = expand(pruneKeyPtrI, pattern, rowRef, hasNull);
      if (unlikely(err != 0))
      {
        jam();
        releaseSection(pruneKeyPtrI);
        break;
      }

      if (unlikely(hasNull))
      {
        jam();
        DEBUG("T_PRUNE_PATTERN-key contain NULL values");

        // Ignore this request as 'NULL == <column>' will never give a match
        releaseSection(pruneKeyPtrI);
        return;  // Bailout, SCANREQ would have returned 0 rows anyway
      }

      // Hash the prune key to find which fragment this row maps to
      BuildKeyReq tmp;
      Uint32 tableId = treeNodePtr.p->m_primaryTableId;
      err = computePartitionHash(signal, tmp, tableId, pruneKeyPtrI);
      releaseSection(pruneKeyPtrI);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }

      err = getNodes(signal, tmp, tableId);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }

      err = scanFrag_findFrag(list, fragPtr, tmp.fragId);
      if (unlikely(err != 0))
      {
        DEBUG_CRASH();
        break;
      }

      /**
       * NOTE: We can get different receiverRef's here
       *       for different keys. E.g during node-recovery where
       *       primary-fragment is switched.
       *
       *       Use latest that we receive
       *
       * TODO: Also double check table-reorg
       */
      fragPtr.p->m_ref = tmp.receiverRef;
    }
    else
    {
      jam();
      /**
       * If const prune, or no-prune, store on first fragment,
       * and send to 1 or all resp.
       */
      list.first(fragPtr);
    }

    if (treeNodePtr.p->m_bits & TreeNode::T_KEYINFO_CONSTRUCTED)
    {
      jam();
      Local_pattern_store pattern(pool, treeNodePtr.p->m_keyPattern);

      /**
       * Test execution terminated due to 'OutOfSectionMemory':
       * - 17060: Fail on scanFrag_parent_row at first call
       * - 17061: Fail on scanFrag_parent_row if 'isLeaf'
       * - 17062: Fail on scanFrag_parent_row if treeNode not root
       * - 17063: Fail on scanFrag_parent_row at a random node of the query tree
       */
      if (ERROR_INSERTED(17060) ||
         (ERROR_INSERTED(17061) && (treeNodePtr.p->isLeaf())) ||
         (ERROR_INSERTED(17062) && (treeNodePtr.p->m_parentPtrI != RNIL)) ||
         (ERROR_INSERTED(17063) && (rand() % 7) == 0))
      {
        jam();
        CLEAR_ERROR_INSERT_VALUE;
        ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
                 __LINE__,  __FILE__);
        err = DbspjErr::OutOfSectionMemory;
        break;
      }

      bool hasNull = false;
      Uint32 keyPtrI = RNIL;
      err = expand(keyPtrI, pattern, rowRef, hasNull);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }
      if (hasNull)
      {
        jam();
        DEBUG("Key contain NULL values, ignoring it");
        DBUG_ASSERT((treeNodePtr.p->m_bits & TreeNode::T_ONE_SHOT) == 0);
        // Ignore this request as 'NULL == <column>' will never give a match
        releaseSection(keyPtrI);
        return;  // Bailout, SCANREQ would have returned 0 rows anyway
      }
      // Pack bound length + correlation id, renumber bound attributes
      scanFrag_fixupBound(fragPtr, keyPtrI, rowRef.m_src_correlation);

      // Append this bound to the fragment's accumulated range section
      SectionReader key(keyPtrI, getSectionSegmentPool());
      err = appendReaderToSection(fragPtr.p->m_rangePtrI, key, key.getSize());
      releaseSection(keyPtrI);
      if (unlikely(err != 0))
      {
        jam();
        break;
      }
    }
    else
    {
      jam();
      // Fixed key...fix later...
      ndbabort();
    }

    if (treeNodePtr.p->m_bits & TreeNode::T_ONE_SHOT)
    {
      jam();
      /**
       * We being a T_ONE_SHOT means that we're only be called
       *   with parent_row once, i.e batch is complete
       */
      scanFrag_parent_batch_complete(signal, requestPtr, treeNodePtr);
    }

    return;
  } while (0);

  ndbrequire(err);
  jam();
  abort(signal, requestPtr, err);
}
7399
7400
/**
 * Patch a freshly expanded range bound (section 'ptrI') so it is valid as
 * one bound of a multi-range scan:
 *  - word 0 gets the total bound size (in words) in its upper 16 bits and
 *    the low 12 bits of the parent row's correlation id packed into bits
 *    4..15 (acts as the 'bound no' tying the bound to its parent row).
 *  - all AttributeHeaders are renumbered 0,1,2,... as bounds require.
 *
 * NOTE(review): the assert masks corrVal with 0xFFFF while the packing
 * below keeps only 12 bits (0xFFF) — consistent with the 12-bit
 * correlation-id limit documented at the callers; verify MaxCorrelationId
 * stays <= 0x1000.
 */
void
Dbspj::scanFrag_fixupBound(Ptr<ScanFragHandle> fragPtr,
                           Uint32 ptrI, Uint32 corrVal)
{
  /**
   * Index bounds...need special tender and care...
   *
   * 1) Set #bound no, bound-size, and renumber attributes
   */
  SectionReader r0(ptrI, getSectionSegmentPool());
  const Uint32 boundsz = r0.getSize();

  Uint32 tmp;
  ndbrequire(r0.peekWord(&tmp));
  ndbassert((corrVal & 0xFFFF) < MaxCorrelationId);
  tmp |= (boundsz << 16) | ((corrVal & 0xFFF) << 4);
  ndbrequire(r0.updateWord(tmp));
  ndbrequire(r0.step(1));    // Skip first BoundType

  // Note: Renumbering below assume there are only EQ-bounds !!
  Uint32 id = 0;
  Uint32 len32;
  do
  {
    ndbrequire(r0.peekWord(&tmp));
    AttributeHeader ah(tmp);
    const Uint32 len = ah.getByteSize();
    AttributeHeader::init(&tmp, id++, len);
    ndbrequire(r0.updateWord(tmp));
    len32 = (len + 3) >> 2;
  } while (r0.step(2 + len32));  // Skip AttributeHeader(1) + Attribute(len32) + next BoundType(1)
}
7433
/**
 * The parent TreeNode has delivered its full batch of rows: decide which
 * of our fragments now need to be scanned, update the per-fragment 'm_state'
 * and the aggregated m_frags_* counters accordingly, and kick off the scan.
 *
 * A fragment without any accumulated range info produces no rows:
 *  - unpruned scans: no range on the (shared) first fragment means an empty
 *    result for every fragment;
 *  - pruned scans: only fragments some prune key hashed to carry ranges,
 *    the rest are marked SFH_COMPLETE immediately.
 */
void
Dbspj::scanFrag_parent_batch_complete(Signal* signal,
                                      Ptr<Request> requestPtr,
                                      Ptr<TreeNode> treeNodePtr)
{
  jam();
  ndbassert(treeNodePtr.p->m_parentPtrI != RNIL);
  ndbassert(treeNodePtr.p->m_state == TreeNode::TN_INACTIVE);

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  ndbassert(data.m_frags_complete == data.m_fragCount);

  /**
   * Update the fragments 'm_state' and the aggregated TreeNode::m_frag_*
   * counters to reflect which fragments we should now start scanning.
   * NOTE: 'm_state' is not maintained if all 'complete' - node becomes
   * inactive
   */
  {
    Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
    Ptr<ScanFragHandle> fragPtr;
    list.first(fragPtr);
    data.m_frags_complete = 0;

    if ((treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN) == 0)
    {
      /* No pruning, first fragment in list contains any range info */
      if (fragPtr.p->m_rangePtrI != RNIL)
      {
        /* All fragments to be scanned with range info */
        while(!fragPtr.isNull())
        {
          ndbassert(fragPtr.p->m_state == ScanFragHandle::SFH_NOT_STARTED ||
                    fragPtr.p->m_state == ScanFragHandle::SFH_COMPLETE);
          fragPtr.p->m_state = ScanFragHandle::SFH_NOT_STARTED;
          list.next(fragPtr);
        }
      }
      else
      {
        /* No range info therefore empty result set. */
        jam();
        data.m_frags_complete = data.m_fragCount;
      }
    }
    else
    {
      /* Per fragment pruning, mark and count pruned-out
       * (rangeless) fragments as completed
       */
      while(!fragPtr.isNull())
      {
        fragPtr.p->m_state = ScanFragHandle::SFH_NOT_STARTED;
        if (fragPtr.p->m_rangePtrI == RNIL)
        {
          jam();
          /**
           * This is a pruned scan, so we only scan those fragments that
           * some distribution key hashed to.
           */
          fragPtr.p->m_state = ScanFragHandle::SFH_COMPLETE;
          data.m_frags_complete++;
        }
        list.next(fragPtr);
      }
    }
    data.m_frags_not_started = data.m_fragCount - data.m_frags_complete;
  }

  if (data.m_frags_complete == data.m_fragCount)
  {
    jam();
    /**
     * No keys was produced...
     */
    return;
  }

  /**
   * When parent's batch is complete, we send our batch
   */
  scanFrag_send(signal, requestPtr, treeNodePtr);
}
7517
7518
/**
 * Start a new round of fragment scans for this TreeNode.
 *
 * Determines the scan parallelism (number of fragments scanned in this
 * batch): either all-at-once (T_SCAN_PARALLEL), a conservative default when
 * no statistics exist yet, or an estimate from earlier executions of this
 * operation (mean - 2*stddev of the observed optimal parallelism). The
 * per-fragment batch size is the total batch size divided by the chosen
 * parallelism. Finally delegates the actual sending to the 7-argument
 * scanFrag_send() overload and, if anything was sent, marks the node ACTIVE
 * and counts it as outstanding on the Request.
 */
void
Dbspj::scanFrag_send(Signal* signal,
                     Ptr<Request> requestPtr,
                     Ptr<TreeNode> treeNodePtr)
{
  jam();
  ndbassert(treeNodePtr.p->m_state == TreeNode::TN_INACTIVE);
  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  ndbassert(data.m_frags_outstanding == 0);
  ndbassert(data.m_frags_not_started == (data.m_fragCount - data.m_frags_complete));

  const ScanFragReq * org = (const ScanFragReq*)data.m_scanFragReq;
  ndbrequire(org->batch_size_rows > 0);

  data.m_firstBatch = true;
  if (treeNodePtr.p->m_bits & TreeNode::T_SCAN_PARALLEL)
  {
    jam();
    // Full parallelism, but never more fragments than rows in the batch
    data.m_parallelism = MIN(data.m_frags_not_started, org->batch_size_rows);
  }
  else if (!data.m_parallelismStat.isValid())
  {
    /**
     * No valid statistics yet to estimate 'parallism' from. We start
     * by reading a few fragments, but suffient many to take full advantage
     * of scan parallelism. Batch completion will provide a parallelism sample,
     * such that we can do a better parallelism guess next time.
     * Note that SCAN_FRAGCONF may start more scans when this scan completes,
     * if there are a sufficient amount of unused batch size left.
     */
    jam();
    data.m_parallelism = MIN(requestPtr.p->m_rootFragCnt,
                             data.m_frags_not_started);
  }
  else
  {
    jam();
    /**
     * Use statistics from earlier runs of this operation to estimate the
     * initial parallelism. We use the mean minus two times the standard
     * deviation to have a low risk of setting parallelism to high (as erring
     * in the other direction is more costly).
     */
    Int32 parallelism =
      static_cast<Int32>(MIN(data.m_parallelismStat.getMean() +
                               // Add 0.5 to get proper rounding.
                               - 2 * data.m_parallelismStat.getStdDev() + 0.5,
                             org->batch_size_rows));

    if (parallelism < static_cast<Int32>(requestPtr.p->m_rootFragCnt))
    {
      jam();
      // Never go below the default minimum parallelism
      parallelism = MIN(requestPtr.p->m_rootFragCnt, data.m_frags_not_started);
    }
    else if (data.m_frags_not_started % parallelism != 0)
    {
      jam();
      /**
       * Set parallelism such that we can expect to have similar
       * parallelism in each batch. For example if there are 8 remaining
       * fragments, then we should fetch 2 times 4 fragments rather than
       * 7+1.
       */
      const Int32 roundTrips = 1 + data.m_frags_not_started / parallelism;
      parallelism = data.m_frags_not_started / roundTrips;
    }

    // Allow higher parallelism to avoid 'rows' capped by MAX_PARALLEL_OP_PER_SCAN
    if ((org->batch_size_rows / parallelism) > MAX_PARALLEL_OP_PER_SCAN)
    {
      jam();
      parallelism = MIN((org->batch_size_rows + MAX_PARALLEL_OP_PER_SCAN-1)
                        / MAX_PARALLEL_OP_PER_SCAN,
                        data.m_frags_not_started);
    }

    ndbassert(parallelism >= 1);
    ndbassert((Uint32)parallelism + data.m_frags_complete <= data.m_fragCount);
    data.m_parallelism = static_cast<Uint32>(parallelism);

#ifdef DEBUG_SCAN_FRAGREQ
    DEBUG("::scanFrag_send(), starting fragment scan with parallelism="
          << data.m_parallelism);
#endif
  }
  ndbrequire(data.m_parallelism > 0);

  // Cap batchSize-rows to avoid exceeding MAX_PARALLEL_OP_PER_SCAN
  const Uint32 bs_rows = MIN(org->batch_size_rows / data.m_parallelism,
                             MAX_PARALLEL_OP_PER_SCAN);
  const Uint32 bs_bytes = org->batch_size_bytes / data.m_parallelism;
  ndbassert(bs_rows > 0);
  ndbassert(bs_bytes > 0);

  // Reset per-batch statistics before the new round of SCAN_FRAGREQs
  data.m_rows_received = 0;
  data.m_rows_expecting = 0;
  data.m_largestBatchRows = 0;
  data.m_largestBatchBytes = 0;
  data.m_totalRows = 0;
  data.m_totalBytes = 0;

  Uint32 batchRange = 0;
  Uint32 frags_started =
    scanFrag_send(signal,
                  requestPtr,
                  treeNodePtr,
                  data.m_parallelism,
                  bs_bytes,
                  bs_rows,
                  batchRange);

  /**
   * scanFrag_send might fail to send (errors?):
   * Check that we really did send something before
   * updating outstanding & active.
   */
  if (likely(frags_started > 0))
  {
    jam();
    ndbrequire(static_cast<Uint32>(data.m_frags_outstanding +
                                   data.m_frags_complete) <=
               data.m_fragCount);

    data.m_batch_chunks = 1;
    requestPtr.p->m_cnt_active++;
    requestPtr.p->m_outstanding++;
    requestPtr.p->m_completed_tree_nodes.clear(treeNodePtr.p->m_node_no);
    treeNodePtr.p->m_state = TreeNode::TN_ACTIVE;
  }
}
7649
/**
 * Ask for the first batch for a number of fragments.
 *
 * Builds and sends one SCAN_FRAGREQ per fragment (up to 'noOfFrags'),
 * attaching attrInfo and key/range sections and carefully managing their
 * lifetime ('releaseAtSend', see the long comment in the loop body).
 * On error the whole Request is aborted.
 *
 * Returns how many fragments we did request the
 * 'first batch' from. (<= noOfFrags)
 */
Uint32
Dbspj::scanFrag_send(Signal* signal,
                     Ptr<Request> requestPtr,
                     Ptr<TreeNode> treeNodePtr,
                     Uint32 noOfFrags,
                     Uint32 bs_bytes,
                     Uint32 bs_rows,
                     Uint32& batchRange)
{
  jam();
  ndbassert(bs_bytes > 0);
  ndbassert(bs_rows > 0);
  ndbassert(bs_rows <= bs_bytes);
  /**
   * if (m_bits & prunemask):
   * - Range keys sliced out to each ScanFragHandle
   * - Else, range keys kept on first (and only) ScanFragHandle
   */
  const bool prune = treeNodePtr.p->m_bits &
    (TreeNode::T_PRUNE_PATTERN | TreeNode::T_CONST_PRUNE);

  /**
   * If scan is repeatable, we must make sure not to release range keys so
   * that we can use them again in the next repetition.
   */
  const bool repeatable =
    (treeNodePtr.p->m_bits & TreeNode::T_SCAN_REPEATABLE) != 0;

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  ndbassert(noOfFrags > 0);
  ndbassert(noOfFrags <= data.m_frags_not_started);
  ScanFragReq* const req =
    reinterpret_cast<ScanFragReq*>(signal->getDataPtrSend());
  const ScanFragReq * const org
    = reinterpret_cast<ScanFragReq*>(data.m_scanFragReq);

  // Start from the prototype request stored on the TreeNode
  memcpy(req, org, sizeof(data.m_scanFragReq));
  // req->variableData[0] // set below
  req->variableData[1] = requestPtr.p->m_rootResultData;
  req->batch_size_bytes = bs_bytes;
  req->batch_size_rows = MIN(bs_rows,MAX_PARALLEL_OP_PER_SCAN);

  Uint32 requestsSent = 0;
  Uint32 err = checkTableError(treeNodePtr);
  if (likely(err == 0))
  {
    Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
    Ptr<ScanFragHandle> fragPtr;
    list.first(fragPtr);

    /**
     * Iterate over the list of fragments until we have sent as many
     * SCAN_FRAGREQs as we should.
     */
    while (requestsSent < noOfFrags)
    {
      jam();
      ndbassert(!fragPtr.isNull());
      /**
       * There is a 12-bit implementation limit on how large
       * the 'parent-row-correlation-id' may be. Thus, if rows
       * from this scan may be 'parents', number of rows in batch
       * should not exceed what could be represented in 12 bits.
       * See also Dbspj::scanFrag_fixupBound()
       */
      ndbassert(treeNodePtr.p->isLeaf() ||
                batchRange+bs_rows <= MaxCorrelationId);

      if (fragPtr.p->m_state != ScanFragHandle::SFH_NOT_STARTED)
      {
        // Skip forward to the frags that we should send.
        jam();
        list.next(fragPtr);
        continue;
      }

      const Uint32 ref = fragPtr.p->m_ref;

      if (noOfFrags==1 && !prune &&
          data.m_frags_not_started == data.m_fragCount &&
          refToNode(ref) != getOwnNodeId() &&
          list.hasNext(fragPtr))
      {
        /**
         * If we are doing a scan with adaptive parallelism and start with
         * parallelism=1 then it makes sense to fetch a batch from a fragment on
         * the local data node. The reason for this is that if that fragment
         * contains few rows, we may be able to read from several fragments in
         * parallel. Then we minimize the total number of round trips (to remote
         * data nodes) if we fetch the first fragment batch locally.
         */
        jam();
        list.next(fragPtr);
        continue;
      }

      /**
       * Set data specific for this fragment
       */
      req->senderData = fragPtr.i;
      req->fragmentNoKeyLen = fragPtr.p->m_fragId;
      req->variableData[0] = batchRange;

      /**
       * Set up the key-/attrInfo to be sent with the SCAN_FRAGREQ.
       * Determine whether these should released as part of the
       * send or not. We try to 'release' whenever possible in order
       * to avoid copying them when sent locally. However, we need
       * to make sure that the key/attr will not be reused before
       * they can be released. Note:
       *
       * - Only the rootNode is ONE_SHOT.
       * - keyInfo comes from either m_send.m_keyInfoPtrI or
       *   fragPtr.p->m_rangePtrI (not both! - 'XOR').
       * - If the child scan is pruned, a seperate 'rangePtr' is
       *   build for each frag - Non-pruned scan store the 'rangePtr'
       *   in the first frag, which is reused for all the frags.
       * - Child nodes can possibly be 'repeatable', which implies
       *   that m_rangePtrI can't be released yet.
       * - attrInfo is always taken from m_send.m_attrInfoPtrI, and
       *   is reused from all frag scans, either repeated or not!
       *
       * Note the somewhat different lifetime of key- vs attrInfo:
       * Except for the ONE_SHOT rootNode, the attrInfo always has
       * to be kept longer than 'key' before released.
       * As sendSignal() either release both or none, we can't
       * set 'releaseAtSend' to suite both key- and attrInfo
       * lifetime.
       *
       * Thus, we set 'releaseAtSend' to suite the shorter lifecycle
       * of the 'range' keys. attrInfo is duplicated whenever needed
       * such that a copy can be released together with the keyInfo.
       */
      Uint32 attrInfoPtrI = treeNodePtr.p->m_send.m_attrInfoPtrI;
      Uint32 keyInfoPtrI = treeNodePtr.p->m_send.m_keyInfoPtrI;
      bool releaseAtSend = false;

      if (treeNodePtr.p->m_bits & TreeNode::T_ONE_SHOT &&
          data.m_frags_not_started==1)
      {
        jam();
        ndbassert(!repeatable);
        ndbassert(fragPtr.p->m_rangePtrI == RNIL);
        /**
         * Pass sections to send and release them (root only)
         */
        treeNodePtr.p->m_send.m_attrInfoPtrI = RNIL;
        treeNodePtr.p->m_send.m_keyInfoPtrI = RNIL;
        releaseAtSend = true;
      }
      else
      {
        jam();
        Ptr<ScanFragHandle> fragWithRangePtr;
        if (prune)
        {
          jam();
          fragWithRangePtr = fragPtr;
          releaseAtSend = !repeatable;
        }
        else
        {
          /**
           * Note: if not 'prune', keyInfo is only set in first fragPtr,
           *   even if it is valid for all of them. (save some mem.)
           */
          jam();
          list.first(fragWithRangePtr);
          releaseAtSend = (!repeatable && data.m_frags_not_started==1);
        }
        if (fragWithRangePtr.p->m_rangePtrI != RNIL)
        {
          ndbassert(keyInfoPtrI == RNIL);  //Not both keyInfo and 'range'
          keyInfoPtrI = fragWithRangePtr.p->m_rangePtrI;
        }
        /**
         * 'releaseAtSend' is set above based on the keyInfo lifetime.
         * Copy the attrInfo (comment above) whenever needed.
         */
        if (releaseAtSend)
        {
          jam();
          /**
           * Test execution terminated due to 'OutOfSectionMemory' which
           * may happen for different treeNodes in the request:
           * - 17090: Fail on any scanFrag_send()
           * - 17091: Fail after sending SCAN_FRAGREQ to some fragments
           * - 17092: Fail on scanFrag_send() if 'isLeaf'
           * - 17093: Fail on scanFrag_send() if treeNode not root
           */
          if (ERROR_INSERTED(17090) ||
             (ERROR_INSERTED(17091) && requestsSent > 1) ||
             (ERROR_INSERTED(17092) && treeNodePtr.p->isLeaf()) ||
             (ERROR_INSERTED(17093) && treeNodePtr.p->m_parentPtrI != RNIL))
          {
            jam();
            CLEAR_ERROR_INSERT_VALUE;
            ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
                     __LINE__,  __FILE__);
            err = DbspjErr::OutOfSectionMemory;
            break;
          }
          // Duplicate attrInfo so the copy may be released with the keyInfo
          Uint32 tmp = RNIL;
          if (!dupSection(tmp, attrInfoPtrI))
          {
            jam();
            ndbassert(tmp == RNIL);  // Guard for memleak
            err = DbspjErr::OutOfSectionMemory;
            break;
          }
          attrInfoPtrI = tmp;

          /** Reflect the release of the keyInfo 'range' set above */
          fragWithRangePtr.p->m_rangePtrI = RNIL;
        } //if (releaseAtSend)
      }

      SectionHandle handle(this);
      getSection(handle.m_ptr[0], attrInfoPtrI);
      handle.m_cnt = 1;
      if (keyInfoPtrI != RNIL)
      {
        jam();
        getSection(handle.m_ptr[1], keyInfoPtrI);
        handle.m_cnt++;
      }

#if defined DEBUG_SCAN_FRAGREQ
      ndbout_c("SCAN_FRAGREQ to %x", ref);
      printSCAN_FRAGREQ(stdout, signal->getDataPtrSend(),
                        NDB_ARRAY_SIZE(treeNodePtr.p->m_scanFrag_data.m_scanFragReq),
                        DBLQH);
      printf("ATTRINFO: ");
      print(handle.m_ptr[0], stdout);
      if (handle.m_cnt > 1)
      {
        printf("KEYINFO: ");
        print(handle.m_ptr[1], stdout);
      }
#endif

      // Statistics counters: table scan vs local/remote range scan
      if (!ScanFragReq::getRangeScanFlag(req->requestInfo))
      {
        c_Counters.incr_counter(CI_LOCAL_TABLE_SCANS_SENT, 1);
      }
      else if (refToNode(ref) == getOwnNodeId())
      {
        c_Counters.incr_counter(CI_LOCAL_RANGE_SCANS_SENT, 1);
      }
      else
      {
        ndbrequire(!ERROR_INSERTED(17014));
        c_Counters.incr_counter(CI_REMOTE_RANGE_SCANS_SENT, 1);
      }

      /**
       * For a non-repeatable pruned scan, key info is unique for each
       * fragment and therefore cannot be reused, so we release key info
       * right away.
       */

      if (ERROR_INSERTED(17110) ||
         (ERROR_INSERTED(17111) && treeNodePtr.p->isLeaf()) ||
         (ERROR_INSERTED(17112) && treeNodePtr.p->m_parentPtrI != RNIL))
      {
        jam();
        CLEAR_ERROR_INSERT_VALUE;
        ndbout_c("Injecting invalid schema version error at line %d file %s",
                 __LINE__,  __FILE__);
        // Provoke 'Invalid schema version' in order to receive SCAN_FRAGREF
        req->schemaVersion++;
      }

      /**
       * To reduce the copy burden we want to keep hold of the
       * AttrInfo and KeyInfo sections after sending them to
       * LQH. To do this we perform the fragmented send inline,
       * so that all fragments are sent *now*. This avoids any
       * problems with the fragmented send CONTINUE 'thread' using
       * the section while we hold or even release it. The
       * signal receiver can still take realtime breaks when
       * receiving.
       *
       * Indicate to sendBatchedFragmentedSignal that we want to
       * keep the fragments, so it must not free them, unless this
       * is the last request in which case they can be freed. If
       * the last request is a local send then a copy is avoided.
       */
      {
        jam();
        sendBatchedFragmentedSignal(ref,
                                    GSN_SCAN_FRAGREQ,
                                    signal,
                                    NDB_ARRAY_SIZE(data.m_scanFragReq),
                                    JBB,
                                    &handle,
                                    !releaseAtSend);  //Keep sent sections,
                                                      //unless last send
      }

      if (releaseAtSend)
      {
        ndbassert(handle.m_cnt == 0);
      }
      handle.clear();

      fragPtr.p->m_state = ScanFragHandle::SFH_SCANNING; // running
      data.m_frags_outstanding++;
      data.m_frags_not_started--;
      batchRange += bs_rows;
      requestsSent++;
      list.next(fragPtr);
    } // while (requestsSent < noOfFrags)
  }
  if (err)
  {
    jam();
    abort(signal, requestPtr, err);
  }

  return requestsSent;
}
7978
7979 void
scanFrag_parent_batch_repeat(Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)7980 Dbspj::scanFrag_parent_batch_repeat(Signal* signal,
7981 Ptr<Request> requestPtr,
7982 Ptr<TreeNode> treeNodePtr)
7983 {
7984 jam();
7985 ndbassert(treeNodePtr.p->m_parentPtrI != RNIL);
7986
7987 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
7988
7989 DEBUG("scanFrag_parent_batch_repeat(), m_node_no: " << treeNodePtr.p->m_node_no
7990 << ", m_batch_chunks: " << data.m_batch_chunks);
7991
7992 ndbassert(treeNodePtr.p->m_bits & TreeNode::T_SCAN_REPEATABLE);
7993
7994 /**
7995 * Register fragment-scans to be restarted if we didn't get all
7996 * previously fetched parent related child rows in a single batch.
7997 */
7998 if (data.m_batch_chunks > 1)
7999 {
8000 jam();
8001 DEBUG("Register TreeNode for restart, m_node_no: " << treeNodePtr.p->m_node_no);
8002 ndbrequire(treeNodePtr.p->m_state != TreeNode::TN_ACTIVE);
8003 registerActiveCursor(requestPtr, treeNodePtr);
8004 data.m_batch_chunks = 0;
8005 }
8006 }
8007
8008 void
scanFrag_countSignal(const Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr,Uint32 cnt)8009 Dbspj::scanFrag_countSignal(const Signal* signal,
8010 Ptr<Request> requestPtr,
8011 Ptr<TreeNode> treeNodePtr,
8012 Uint32 cnt)
8013 {
8014 jam();
8015 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8016 data.m_rows_received += cnt;
8017
8018 if (data.m_frags_outstanding == 0 &&
8019 data.m_rows_received == data.m_rows_expecting)
8020 {
8021 jam();
8022 ndbassert(requestPtr.p->m_outstanding > 0);
8023 requestPtr.p->m_outstanding--;
8024
8025 // We have received all rows for this treeNode in this batch.
8026 requestPtr.p->m_completed_tree_nodes.set(treeNodePtr.p->m_node_no);
8027 }
8028 }
8029
8030 void
scanFrag_execSCAN_FRAGCONF(Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr,Ptr<ScanFragHandle> fragPtr)8031 Dbspj::scanFrag_execSCAN_FRAGCONF(Signal* signal,
8032 Ptr<Request> requestPtr,
8033 Ptr<TreeNode> treeNodePtr,
8034 Ptr<ScanFragHandle> fragPtr)
8035 {
8036 jam();
8037
8038 const ScanFragConf * conf = (const ScanFragConf*)(signal->getDataPtr());
8039
8040 Uint32 rows = conf->completedOps;
8041 Uint32 done = conf->fragmentCompleted;
8042 Uint32 bytes = conf->total_len * sizeof(Uint32);
8043
8044 Uint32 state = fragPtr.p->m_state;
8045 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8046
8047 if (state == ScanFragHandle::SFH_WAIT_CLOSE && done == 0)
8048 {
8049 jam();
8050 /**
8051 * We sent an explicit close request...ignore this...a close will come later
8052 */
8053 return;
8054 }
8055
8056 requestPtr.p->m_rows += rows;
8057 data.m_totalRows += rows;
8058 data.m_totalBytes += bytes;
8059 data.m_largestBatchRows = MAX(data.m_largestBatchRows, rows);
8060 data.m_largestBatchBytes = MAX(data.m_largestBatchBytes, bytes);
8061
8062 if (treeNodePtr.p->m_bits & TreeNode::T_EXPECT_TRANSID_AI)
8063 {
8064 jam();
8065 data.m_rows_expecting += rows;
8066 }
8067
8068 ndbrequire(data.m_frags_outstanding);
8069 ndbrequire(state == ScanFragHandle::SFH_SCANNING ||
8070 state == ScanFragHandle::SFH_WAIT_CLOSE);
8071
8072 data.m_frags_outstanding--;
8073 fragPtr.p->m_state = ScanFragHandle::SFH_WAIT_NEXTREQ;
8074
8075 if (done)
8076 {
8077 jam();
8078 fragPtr.p->m_state = ScanFragHandle::SFH_COMPLETE;
8079 ndbrequire(data.m_frags_complete < data.m_fragCount);
8080 data.m_frags_complete++;
8081
8082 if (data.m_frags_complete == data.m_fragCount ||
8083 ((requestPtr.p->m_state & Request::RS_ABORTING) != 0 &&
8084 data.m_fragCount == (data.m_frags_complete + data.m_frags_not_started)))
8085 {
8086 jam();
8087 ndbrequire(requestPtr.p->m_cnt_active);
8088 requestPtr.p->m_cnt_active--;
8089 treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
8090 }
8091 }
8092
8093 if (data.m_frags_outstanding == 0)
8094 {
8095 const bool isFirstBatch = data.m_firstBatch;
8096 data.m_firstBatch = false;
8097
8098 const ScanFragReq * const org
8099 = reinterpret_cast<const ScanFragReq*>(data.m_scanFragReq);
8100
8101 if (data.m_frags_complete == data.m_fragCount)
8102 {
8103 jam();
8104 /**
8105 * Calculate what would have been the optimal parallelism for the
8106 * scan instance that we have just completed, and update
8107 * 'parallelismStat' with this value. We then use this statistics to set
8108 * the initial parallelism for the next instance of this operation.
8109 */
8110 double parallelism = data.m_fragCount;
8111 if (data.m_totalRows > 0)
8112 {
8113 parallelism = MIN(parallelism,
8114 double(org->batch_size_rows) * data.m_fragCount
8115 / data.m_totalRows);
8116 }
8117 if (data.m_totalBytes > 0)
8118 {
8119 parallelism = MIN(parallelism,
8120 double(org->batch_size_bytes) * data.m_fragCount
8121 / data.m_totalBytes);
8122 }
8123 data.m_parallelismStat.sample(parallelism);
8124 }
8125
8126 /**
8127 * Don't continue scan if we're aborting...
8128 */
8129 ndbassert(state != ScanFragHandle::SFH_WAIT_CLOSE ||
8130 (requestPtr.p->m_state & Request::RS_ABORTING));
8131
8132 if (state == ScanFragHandle::SFH_SCANNING &&
8133 isFirstBatch && data.m_frags_not_started > 0)
8134 {
8135 jam();
8136 /**
8137 * Check if we can expect to be able to fetch the entire result set by
8138 * asking for more fragments within the same batch. This may improve
8139 * performance for bushy scans, as subsequent bushy branches must be
8140 * re-executed for each batch of this scan.
8141 */
8142
8143 /**
8144 * Find the maximal correlation value that we may have seen so far.
8145 * Correlation value must be unique within batch and smaller than
8146 * org->batch_size_rows.
8147 */
8148 const Uint32 maxCorrVal = (data.m_totalRows == 0) ? 0 :
8149 ((org->batch_size_rows / data.m_parallelism) * (data.m_parallelism - 1))
8150 + data.m_totalRows;
8151
8152 // Number of rows & bytes that we can still fetch in this batch.
8153 const Int32 remainingRows
8154 = static_cast<Int32>(org->batch_size_rows - maxCorrVal);
8155 const Int32 remainingBytes
8156 = static_cast<Int32>(org->batch_size_bytes - data.m_totalBytes);
8157
8158 if (remainingRows >= data.m_frags_not_started &&
8159 remainingBytes >= data.m_frags_not_started &&
8160 /**
8161 * Check that (remaning row capacity)/(remaining fragments) is
8162 * greater or equal to (rows read so far)/(finished fragments).
8163 */
8164 remainingRows * static_cast<Int32>(data.m_parallelism) >=
8165 static_cast<Int32>(data.m_totalRows * data.m_frags_not_started) &&
8166 remainingBytes * static_cast<Int32>(data.m_parallelism) >=
8167 static_cast<Int32>(data.m_totalBytes * data.m_frags_not_started))
8168 {
8169 jam();
8170 Uint32 batchRange = maxCorrVal;
8171 Uint32 bs_rows = remainingRows / data.m_frags_not_started;
8172 Uint32 bs_bytes = remainingBytes / data.m_frags_not_started;
8173
8174 DEBUG("::scanFrag_execSCAN_FRAGCONF() first batch was not full."
8175 " Asking for new batches from " << data.m_frags_not_started <<
8176 " fragments with " <<
8177 bs_rows <<" rows and " <<
8178 bs_bytes << " bytes.");
8179
8180 if (unlikely(bs_rows > bs_bytes))
8181 bs_rows = bs_bytes;
8182
8183 Uint32 frags_started =
8184 scanFrag_send(signal,
8185 requestPtr,
8186 treeNodePtr,
8187 data.m_frags_not_started,
8188 bs_bytes,
8189 bs_rows,
8190 batchRange);
8191
8192 if (likely(frags_started > 0))
8193 return;
8194
8195 // Else: scanFrag_send() didn't send anything for some reason.
8196 // Need to continue into 'completion detection' below.
8197 jam();
8198 }
8199 } // if (isFirstBatch ...)
8200
8201 if (data.m_rows_received == data.m_rows_expecting ||
8202 state == ScanFragHandle::SFH_WAIT_CLOSE)
8203 {
8204 jam();
8205 ndbassert(requestPtr.p->m_outstanding > 0);
8206 requestPtr.p->m_outstanding--;
8207 requestPtr.p->m_completed_tree_nodes.set(treeNodePtr.p->m_node_no);
8208 handleTreeNodeComplete(signal, requestPtr, treeNodePtr);
8209 }
8210 } // if (data.m_frags_outstanding == 0)
8211 }
8212
/**
 * A scan on one fragment failed (SCAN_FRAGREF received).
 * Mark that fragment as complete, keep the per-node and per-request
 * counters consistent, then abort the whole request with the received
 * error code.
 */
void
Dbspj::scanFrag_execSCAN_FRAGREF(Signal* signal,
                                 Ptr<Request> requestPtr,
                                 Ptr<TreeNode> treeNodePtr,
                                 Ptr<ScanFragHandle> fragPtr)
{
  jam();

  const ScanFragRef * rep = CAST_CONSTPTR(ScanFragRef, signal->getDataPtr());
  const Uint32 errCode = rep->errorCode;

  // A REF may only arrive while scanning, or while a close was requested
  Uint32 state = fragPtr.p->m_state;
  ndbrequire(state == ScanFragHandle::SFH_SCANNING ||
             state == ScanFragHandle::SFH_WAIT_CLOSE);

  // The failed fragment will produce no more results
  fragPtr.p->m_state = ScanFragHandle::SFH_COMPLETE;

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  ndbrequire(data.m_frags_complete < data.m_fragCount);
  data.m_frags_complete++;
  ndbrequire(data.m_frags_outstanding > 0);
  data.m_frags_outstanding--;

  // All fragments either completed or never started -> node goes inactive
  if (data.m_fragCount == (data.m_frags_complete + data.m_frags_not_started))
  {
    jam();
    ndbrequire(requestPtr.p->m_cnt_active);
    requestPtr.p->m_cnt_active--;
    treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
  }

  // Last outstanding reply for this node -> the node no longer counts
  // as outstanding on the request
  if (data.m_frags_outstanding == 0)
  {
    jam();
    ndbrequire(requestPtr.p->m_outstanding);
    requestPtr.p->m_outstanding--;
  }

  abort(signal, requestPtr, errCode);
}
8253
/**
 * Handle SCAN_NEXTREQ for this (scan-)TreeNode: fetch the next batch.
 * First decide the parallelism to use, then ask already started
 * fragments for more rows (SCAN_NEXTREQ) and, if the parallelism
 * budget is not yet filled, start new fragments via scanFrag_send().
 */
void
Dbspj::scanFrag_execSCAN_NEXTREQ(Signal* signal,
                                 Ptr<Request> requestPtr,
                                 Ptr<TreeNode> treeNodePtr)
{
  jam();
  Uint32 err = checkTableError(treeNodePtr);
  if (unlikely(err))
  {
    jam();
    abort(signal, requestPtr, err);
    return;
  }

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  const ScanFragReq * org = (const ScanFragReq*)data.m_scanFragReq;

  // Reset the row accounting for the new batch
  data.m_rows_received = 0;
  data.m_rows_expecting = 0;
  ndbassert(data.m_frags_outstanding == 0);

  ndbrequire(data.m_frags_complete < data.m_fragCount);
  if ((treeNodePtr.p->m_bits & TreeNode::T_SCAN_PARALLEL) == 0)
  {
    jam();
    /**
     * Since fetching few but large batches is more efficient, we
     * set parallelism to the lowest value where we can still expect each
     * batch to be full.
     */
    if (data.m_largestBatchRows < org->batch_size_rows/data.m_parallelism &&
        data.m_largestBatchBytes < org->batch_size_bytes/data.m_parallelism)
    {
      jam();
      // Batches were not full: allow more fragments in parallel, bounded
      // by remaining fragments and by the row budget
      data.m_parallelism = MIN(data.m_fragCount - data.m_frags_complete,
                               org->batch_size_rows);
      if (data.m_largestBatchRows > 0)
      {
        jam();
        // Limit so that each fragment can still fill its row share
        data.m_parallelism =
          MIN(org->batch_size_rows / data.m_largestBatchRows,
              data.m_parallelism);
      }
      if (data.m_largestBatchBytes > 0)
      {
        jam();
        // Likewise for the byte budget
        data.m_parallelism =
          MIN(data.m_parallelism,
              org->batch_size_bytes/data.m_largestBatchBytes);
      }
      if (data.m_frags_complete == 0 &&
          data.m_frags_not_started % data.m_parallelism != 0)
      {
        jam();
        /**
         * Set parallelism such that we can expect to have similar
         * parallelism in each batch. For example if there are 8 remaining
         * fragments, then we should fetch 2 times 4 fragments rather than
         * 7+1.
         */
        const Uint32 roundTrips =
          1 + data.m_frags_not_started / data.m_parallelism;
        data.m_parallelism = data.m_frags_not_started / roundTrips;
      }
    }
    else
    {
      jam();
      // We get full batches, so we should lower parallelism.
      data.m_parallelism = MIN(data.m_fragCount - data.m_frags_complete,
                               MAX(1, data.m_parallelism/2));
    }
    if (data.m_parallelism < requestPtr.p->m_rootFragCnt)
    {
      // Avoid starting so few scans that some LDM-threads are sitting idle
      data.m_parallelism = MIN(data.m_fragCount - data.m_frags_complete,
                               requestPtr.p->m_rootFragCnt);
    }
    ndbassert(data.m_parallelism > 0);
#ifdef DEBUG_SCAN_FRAGREQ
    DEBUG("::scanFrag_execSCAN_NEXTREQ() Asking for new batches from " <<
          data.m_parallelism <<
          " fragments with " << org->batch_size_rows/data.m_parallelism <<
          " rows and " << org->batch_size_bytes/data.m_parallelism <<
          " bytes.");
#endif
  }
  else // Max parallelism
  {
    jam();
    data.m_parallelism = MIN(data.m_fragCount - data.m_frags_complete,
                             org->batch_size_rows);
  }

  // Split the batch budget evenly among the fragments scanned in parallel
  const Uint32 bs_rows = MIN(org->batch_size_rows/data.m_parallelism,
                             MAX_PARALLEL_OP_PER_SCAN);
  ndbassert(bs_rows > 0);
  ScanFragNextReq* req =
    reinterpret_cast<ScanFragNextReq*>(signal->getDataPtrSend());
  req->requestInfo = 0;
  ScanFragNextReq::setCorrFactorFlag(req->requestInfo);
  req->transId1 = requestPtr.p->m_transId[0];
  req->transId2 = requestPtr.p->m_transId[1];
  req->batch_size_rows = bs_rows;
  req->batch_size_bytes = org->batch_size_bytes/data.m_parallelism;

  Uint32 batchRange = 0;
  Ptr<ScanFragHandle> fragPtr;
  Uint32 sentFragCount = 0;
  {
    /**
     * First, ask for more data from fragments that are already started.
     */
    Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
    list.first(fragPtr);
    while (sentFragCount < data.m_parallelism && !fragPtr.isNull())
    {
      jam();
      ndbassert(fragPtr.p->m_state == ScanFragHandle::SFH_WAIT_NEXTREQ ||
                fragPtr.p->m_state == ScanFragHandle::SFH_COMPLETE ||
                fragPtr.p->m_state == ScanFragHandle::SFH_NOT_STARTED);
      if (fragPtr.p->m_state == ScanFragHandle::SFH_WAIT_NEXTREQ)
      {
        jam();

        data.m_frags_outstanding++;
        // Each fragment gets its own disjoint correlation range
        req->variableData[0] = batchRange;
        fragPtr.p->m_state = ScanFragHandle::SFH_SCANNING;
        batchRange += bs_rows;

        DEBUG("scanFrag_execSCAN_NEXTREQ to: " << hex
              << fragPtr.p->m_ref
              << ", m_node_no=" << treeNodePtr.p->m_node_no
              << ", senderData: " << req->senderData);

#ifdef DEBUG_SCAN_FRAGREQ
        printSCANFRAGNEXTREQ(stdout, &signal->theData[0],
                             ScanFragNextReq::SignalLength + 1, DBLQH);
#endif

        req->senderData = fragPtr.i;
        sendSignal(fragPtr.p->m_ref, GSN_SCAN_NEXTREQ, signal,
                   ScanFragNextReq::SignalLength + 1,
                   JBB);
        sentFragCount++;
      }
      list.next(fragPtr);
    }
  }

  Uint32 frags_started = 0;
  if (sentFragCount < data.m_parallelism)
  {
    /**
     * Then start new fragments until we reach data.m_parallelism.
     */
    jam();
    ndbassert(data.m_frags_not_started != 0);
    frags_started =
      scanFrag_send(signal,
                    requestPtr,
                    treeNodePtr,
                    data.m_parallelism - sentFragCount,
                    org->batch_size_bytes/data.m_parallelism,
                    bs_rows,
                    batchRange);
  }
  /**
   * sendSignal() or scanFrag_send() might have failed to send:
   * Check that we really did send something before
   * updating outstanding & active.
   */
  if (likely(sentFragCount+frags_started > 0))
  {
    jam();
    ndbrequire(data.m_batch_chunks > 0);
    data.m_batch_chunks++;

    requestPtr.p->m_outstanding++;
    requestPtr.p->m_completed_tree_nodes.clear(treeNodePtr.p->m_node_no);
    ndbassert(treeNodePtr.p->m_state == TreeNode::TN_ACTIVE);
  }
}
8437
8438 void
scanFrag_complete(Signal * signal,Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)8439 Dbspj::scanFrag_complete(Signal* signal,
8440 Ptr<Request> requestPtr,
8441 Ptr<TreeNode> treeNodePtr)
8442 {
8443 jam();
8444 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8445 if (!data.m_fragments.isEmpty())
8446 {
8447 jam();
8448 DihScanTabCompleteRep* rep=(DihScanTabCompleteRep*)signal->getDataPtrSend();
8449 rep->tableId = treeNodePtr.p->m_tableOrIndexId;
8450 rep->scanCookie = data.m_scanCookie;
8451 rep->jamBufferPtr = jamBuffer();
8452
8453 EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_COMPLETE_REP,
8454 signal, DihScanTabCompleteRep::SignalLength, 0);
8455 }
8456 }
8457
/**
 * Abort an ongoing scan on this tree node: send a 'close' SCAN_NEXTREQ
 * to every fragment that is scanning or idle-but-resumable. Fragments
 * that are complete, never started, or already closing need no action.
 */
void
Dbspj::scanFrag_abort(Signal* signal,
                      Ptr<Request> requestPtr,
                      Ptr<TreeNode> treeNodePtr)
{
  jam();

  switch(treeNodePtr.p->m_state){
  case TreeNode::TN_BUILDING:
  case TreeNode::TN_PREPARING:
  case TreeNode::TN_INACTIVE:
  case TreeNode::TN_COMPLETING:
  case TreeNode::TN_END:
    // Nothing is executing for this node -> nothing to close
    DEBUG("scanFrag_abort"
          << ", transId: " << hex << requestPtr.p->m_transId[0]
          << "," << hex << requestPtr.p->m_transId[1]
          << ", state: " << treeNodePtr.p->m_state);
    return;

  case TreeNode::TN_ACTIVE:
    jam();
    break;
  }

  // Build a SCAN_NEXTREQ with close flag set and zero batch sizes
  ScanFragNextReq* req = CAST_PTR(ScanFragNextReq, signal->getDataPtrSend());
  req->requestInfo = 0;
  ScanFragNextReq::setCloseFlag(req->requestInfo, 1);
  req->transId1 = requestPtr.p->m_transId[0];
  req->transId2 = requestPtr.p->m_transId[1];
  req->batch_size_rows = 0;
  req->batch_size_bytes = 0;

  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
  Ptr<ScanFragHandle> fragPtr;

  Uint32 cnt_waiting = 0;   // closed from SFH_WAIT_NEXTREQ (were idle)
  Uint32 cnt_scanning = 0;  // closed from SFH_SCANNING (reply outstanding)
  for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
  {
    switch(fragPtr.p->m_state){
    case ScanFragHandle::SFH_NOT_STARTED:
    case ScanFragHandle::SFH_COMPLETE:
    case ScanFragHandle::SFH_WAIT_CLOSE:
      jam();
      break;
    case ScanFragHandle::SFH_WAIT_NEXTREQ:
      jam();
      cnt_waiting++; // was idle...
      data.m_frags_outstanding++; // is closing
      goto do_abort;
    case ScanFragHandle::SFH_SCANNING:
      jam();
      cnt_scanning++;
      goto do_abort;
    do_abort:
      // Ask the data node to close the scan on this fragment
      req->senderData = fragPtr.i;
      sendSignal(fragPtr.p->m_ref, GSN_SCAN_NEXTREQ, signal,
                 ScanFragNextReq::SignalLength, JBB);

      fragPtr.p->m_state = ScanFragHandle::SFH_WAIT_CLOSE;
      break;
    }
  }

  if (cnt_scanning == 0)
  {
    if (cnt_waiting > 0)
    {
      /**
       * If all were waiting...this should increase m_outstanding
       */
      jam();
      requestPtr.p->m_outstanding++;
    }
    else
    {
      /**
       * All fragments are either complete or not yet started, so there is
       * nothing to abort.
       */
      jam();
      ndbassert(data.m_frags_not_started > 0);
      ndbrequire(requestPtr.p->m_cnt_active);
      requestPtr.p->m_cnt_active--;
      treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
    }
  }
}
8547
/**
 * Handle data node failure(s) for this tree node. Every fragment hosted
 * on a failed node is marked SFH_COMPLETE (and its m_ref cleared), with
 * the per-node / per-request counters adjusted to stay consistent.
 *
 * @returns number of fragments affected by the failure; a non-zero
 *          value indicates that we should abort (see 'sum++' below).
 */
Uint32
Dbspj::scanFrag_execNODE_FAILREP(Signal* signal,
                                 Ptr<Request> requestPtr,
                                 Ptr<TreeNode> treeNodePtr,
                                 const NdbNodeBitmask nodes)
{
  jam();

  switch(treeNodePtr.p->m_state){
  case TreeNode::TN_PREPARING:
  case TreeNode::TN_INACTIVE:
    return 1;

  case TreeNode::TN_BUILDING:
  case TreeNode::TN_COMPLETING:
  case TreeNode::TN_END:
    return 0;

  case TreeNode::TN_ACTIVE:
    jam();
    break;
  }


  Uint32 sum = 0;
  ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
  Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
  Ptr<ScanFragHandle> fragPtr;

  // Snapshot the counters so 0-transitions can be detected after the loop
  Uint32 save0 = data.m_frags_outstanding;
  Uint32 save1 = data.m_frags_complete;

  for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
  {
    if (nodes.get(refToNode(fragPtr.p->m_ref)) == false)
    {
      jam();
      /**
       * No action needed
       */
      continue;
    }

    switch(fragPtr.p->m_state){
    case ScanFragHandle::SFH_NOT_STARTED:
      jam();
      ndbrequire(data.m_frags_complete < data.m_fragCount);
      data.m_frags_complete++;
      ndbrequire(data.m_frags_not_started > 0);
      data.m_frags_not_started--;
      // fall through
    case ScanFragHandle::SFH_COMPLETE:
      jam();
      sum++; // indicate that we should abort
      /**
       * we could keep list of all fragments...
       * or execute DIGETNODES again...
       * but for now, we don't
       */
      break;
    case ScanFragHandle::SFH_WAIT_CLOSE:
    case ScanFragHandle::SFH_SCANNING:
      jam();
      // An outstanding reply will never arrive from the failed node
      ndbrequire(data.m_frags_outstanding > 0);
      data.m_frags_outstanding--;
      // fall through
    case ScanFragHandle::SFH_WAIT_NEXTREQ:
      jam();
      sum++;
      ndbrequire(data.m_frags_complete < data.m_fragCount);
      data.m_frags_complete++;
      break;
    }
    fragPtr.p->m_ref = 0;
    fragPtr.p->m_state = ScanFragHandle::SFH_COMPLETE;
  }

  // Had outstanding replies before, none left now: the node no longer
  // counts as outstanding on the request
  if (save0 != 0 && data.m_frags_outstanding == 0)
  {
    jam();
    ndbrequire(requestPtr.p->m_outstanding);
    requestPtr.p->m_outstanding--;
  }

  // All fragments now complete or unstarted: deactivate the node
  // (only when frags_complete was already non-zero before the loop)
  if (save1 != 0 &&
      data.m_fragCount == (data.m_frags_complete + data.m_frags_not_started))
  {
    jam();
    ndbrequire(requestPtr.p->m_cnt_active);
    requestPtr.p->m_cnt_active--;
    treeNodePtr.p->m_state = TreeNode::TN_INACTIVE;
  }

  return sum;
}
8643
8644 void
scanFrag_release_rangekeys(Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)8645 Dbspj::scanFrag_release_rangekeys(Ptr<Request> requestPtr,
8646 Ptr<TreeNode> treeNodePtr)
8647 {
8648 jam();
8649 DEBUG("scanFrag_release_rangekeys(), tree node " << treeNodePtr.i
8650 << " m_node_no: " << treeNodePtr.p->m_node_no);
8651
8652 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8653 Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8654 Ptr<ScanFragHandle> fragPtr;
8655
8656 if (treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN)
8657 {
8658 jam();
8659 for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
8660 {
8661 if (fragPtr.p->m_rangePtrI != RNIL)
8662 {
8663 releaseSection(fragPtr.p->m_rangePtrI);
8664 fragPtr.p->m_rangePtrI = RNIL;
8665 }
8666 }
8667 }
8668 else
8669 {
8670 jam();
8671 if (!list.first(fragPtr))
8672 return;
8673 if (fragPtr.p->m_rangePtrI != RNIL)
8674 {
8675 releaseSection(fragPtr.p->m_rangePtrI);
8676 fragPtr.p->m_rangePtrI = RNIL;
8677 }
8678 }
8679 }
8680
8681 /**
8682 * Parent batch has completed, and will not refetch (X-joined) results
8683 * from its childs. Release & reset range keys which are unsent or we
8684 * have kept for possible resubmits.
8685 */
8686 void
scanFrag_parent_batch_cleanup(Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)8687 Dbspj::scanFrag_parent_batch_cleanup(Ptr<Request> requestPtr,
8688 Ptr<TreeNode> treeNodePtr)
8689 {
8690 DEBUG("scanFrag_parent_batch_cleanup");
8691 scanFrag_release_rangekeys(requestPtr,treeNodePtr);
8692 }
8693
8694 /**
8695 * Do final cleanup of specified TreeNode. There will be no
8696 * more (re-)execution of either this TreeNode nor other,
8697 * so no need to re-init for further execution.
8698 */
8699 void
scanFrag_cleanup(Ptr<Request> requestPtr,Ptr<TreeNode> treeNodePtr)8700 Dbspj::scanFrag_cleanup(Ptr<Request> requestPtr,
8701 Ptr<TreeNode> treeNodePtr)
8702 {
8703 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8704 DEBUG("scanFrag_cleanup");
8705
8706 /**
8707 * Range keys has been collected wherever there are uncompleted
8708 * parent batches...release them to avoid memleak.
8709 */
8710 scanFrag_release_rangekeys(requestPtr,treeNodePtr);
8711
8712 if (treeNodePtr.p->m_bits & TreeNode::T_PRUNE_PATTERN)
8713 {
8714 jam();
8715 LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
8716 Local_pattern_store pattern(pool, data.m_prunePattern);
8717 pattern.release();
8718 }
8719 else if (treeNodePtr.p->m_bits & TreeNode::T_CONST_PRUNE)
8720 {
8721 jam();
8722 if (data.m_constPrunePtrI != RNIL)
8723 {
8724 jam();
8725 releaseSection(data.m_constPrunePtrI);
8726 data.m_constPrunePtrI = RNIL;
8727 }
8728 }
8729
8730 cleanup_common(requestPtr, treeNodePtr);
8731 }
8732
8733
8734 bool
scanFrag_checkNode(const Ptr<Request> requestPtr,const Ptr<TreeNode> treeNodePtr)8735 Dbspj::scanFrag_checkNode(const Ptr<Request> requestPtr,
8736 const Ptr<TreeNode> treeNodePtr)
8737 {
8738 jam();
8739 if (treeNodePtr.p->m_state != TreeNode::TN_ACTIVE)
8740 {
8741 return true;
8742 }
8743
8744 bool checkResult = true;
8745
8746 {
8747 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8748 Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8749 Ptr<ScanFragHandle> fragPtr;
8750
8751 Uint32 frags_not_started = 0;
8752 Uint32 frags_outstanding_scan = 0;
8753 Uint32 frags_outstanding_close = 0;
8754 Uint32 frags_waiting = 0;
8755 Uint32 frags_completed = 0;
8756
8757 Uint32 fragCount = 0;
8758
8759 for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
8760 {
8761 fragCount++;
8762 switch(fragPtr.p->m_state){
8763 case ScanFragHandle::SFH_NOT_STARTED:
8764 jam();
8765 frags_not_started++;
8766 break;
8767 case ScanFragHandle::SFH_SCANNING:
8768 jam();
8769 frags_outstanding_scan++;
8770 break;
8771 case ScanFragHandle::SFH_WAIT_CLOSE:
8772 jam();
8773 frags_outstanding_close++;
8774 break;
8775 case ScanFragHandle::SFH_WAIT_NEXTREQ:
8776 jam();
8777 frags_waiting++;
8778 break;
8779 case ScanFragHandle::SFH_COMPLETE:
8780 jam();
8781 frags_completed++;
8782 break;
8783 default:
8784 checkResult &= spjCheck(false);
8785 break;
8786 }
8787 }
8788
8789 /**
8790 * Compare counters to state, state must be valid
8791 * at all stable points in time for execNODE_FAILREP
8792 * handling
8793 */
8794 checkResult &= spjCheck(data.m_frags_not_started == frags_not_started);
8795 checkResult &= spjCheck(data.m_frags_outstanding ==
8796 (frags_outstanding_scan +
8797 frags_outstanding_close));
8798 checkResult &= spjCheck(data.m_frags_complete == frags_completed);
8799 }
8800
8801 return checkResult;
8802 }
8803
8804
8805 void
scanFrag_dumpNode(const Ptr<Request> requestPtr,const Ptr<TreeNode> treeNodePtr)8806 Dbspj::scanFrag_dumpNode(const Ptr<Request> requestPtr,
8807 const Ptr<TreeNode> treeNodePtr)
8808 {
8809 jam();
8810
8811 /* Non const ref due to list iteration below */
8812 ScanFragData& data = treeNodePtr.p->m_scanFrag_data;
8813
8814 g_eventLogger->info("DBSPJ %u : ScanFrag fragCount %u frags_complete %u "
8815 "frags_outstanding %u frags_not_started %u ",
8816 instance(),
8817 data.m_fragCount,
8818 data.m_frags_complete,
8819 data.m_frags_outstanding,
8820 data.m_frags_not_started);
8821 g_eventLogger->info("DBSPJ %u : parallelism %u rows_expecting %u "
8822 "rows_received %u firstBatch %u",
8823 instance(),
8824 data.m_parallelism,
8825 data.m_rows_expecting,
8826 data.m_rows_received,
8827 data.m_firstBatch);
8828 g_eventLogger->info("DBSPJ %u : totalRows %u totalBytes %u "
8829 "constPrunePtrI %u",
8830 instance(),
8831 data.m_totalRows,
8832 data.m_totalBytes,
8833 data.m_constPrunePtrI);
8834 {
8835 Local_ScanFragHandle_list list(m_scanfraghandle_pool, data.m_fragments);
8836 Ptr<ScanFragHandle> fragPtr;
8837 for (list.first(fragPtr); !fragPtr.isNull(); list.next(fragPtr))
8838 {
8839 dumpScanFragHandle(fragPtr);
8840 }
8841 }
8842 }
8843
8844 /**
8845 * END - MODULE SCAN FRAGMENT
8846 */
8847
8848 /**
8849 * Static OpInfo handling
8850 */
8851 const Dbspj::OpInfo*
getOpInfo(Uint32 op)8852 Dbspj::getOpInfo(Uint32 op)
8853 {
8854 DEBUG("getOpInfo(" << op << ")");
8855 switch(op){
8856 case QueryNode::QN_LOOKUP:
8857 return &Dbspj::g_LookupOpInfo;
8858 case QueryNode::QN_SCAN_FRAG_v1:
8859 return NULL; //Deprecated, converted into QN_SCAN_FRAG
8860 case QueryNode::QN_SCAN_INDEX_v1:
8861 return NULL; //Deprecated, converted into QN_SCAN_FRAG
8862 case QueryNode::QN_SCAN_FRAG:
8863 return &Dbspj::g_ScanFragOpInfo;
8864 default:
8865 return 0;
8866 }
8867 }
8868
8869 /**
8870 * MODULE COMMON PARSE/UNPACK
8871 */
8872
8873 /**
8874 * @returns dstLen + 1 on error
8875 */
8876 static
8877 Uint32
unpackList(Uint32 dstLen,Uint32 * dst,Dbspj::DABuffer & buffer)8878 unpackList(Uint32 dstLen, Uint32 * dst, Dbspj::DABuffer & buffer)
8879 {
8880 const Uint32 * ptr = buffer.ptr;
8881 if (likely(ptr != buffer.end))
8882 {
8883 Uint32 tmp = * ptr++;
8884 Uint32 cnt = tmp & 0xFFFF;
8885
8886 * dst ++ = (tmp >> 16); // Store first
8887 DEBUG("cnt: " << cnt << " first: " << (tmp >> 16));
8888
8889 if (cnt > 1)
8890 {
8891 Uint32 len = cnt / 2;
8892 if (unlikely(cnt >= dstLen || (ptr + len > buffer.end)))
8893 goto error;
8894
8895 cnt --; // subtract item stored in header
8896
8897 for (Uint32 i = 0; i < cnt/2; i++)
8898 {
8899 * dst++ = (* ptr) & 0xFFFF;
8900 * dst++ = (* ptr) >> 16;
8901 ptr++;
8902 }
8903
8904 if (cnt & 1)
8905 {
8906 * dst ++ = * ptr & 0xFFFF;
8907 ptr++;
8908 }
8909
8910 cnt ++; // readd item stored in header
8911 }
8912 buffer.ptr = ptr;
8913 return cnt;
8914 }
8915 return 0;
8916
8917 error:
8918 return dstLen + 1;
8919 }
8920
/**
 * This function takes an array of attrinfo, and builds "header"
 * which can be used to do random access inside the row
 */
Uint32
Dbspj::buildRowHeader(RowPtr::Header * header, SegmentedSectionPtr ptr)
{
  Uint32 tmp, len;
  Uint32 * dst = header->m_offset;       // one offset entry per attribute
  const Uint32 * const save = dst;
  SectionReader r0(ptr, getSectionSegmentPool());
  Uint32 offset = 0;
  do
  {
    * dst++ = offset;
    r0.getWord(&tmp);                    // read the AttributeHeader word
    len = AttributeHeader::getDataSize(tmp);
    offset += 1 + len;                   // +1 for the AttributeHeader itself
  } while (r0.step(len));                // skip attr data; loop ends when step() fails (reader exhausted)

  return header->m_len = static_cast<Uint32>(dst - save);
}
8943
8944 /**
8945 * This function takes an array of attrinfo, and builds "header"
8946 * which can be used to do random access inside the row
8947 */
8948 Uint32
buildRowHeader(RowPtr::Header * header,const Uint32 * & src,Uint32 len)8949 Dbspj::buildRowHeader(RowPtr::Header * header, const Uint32 *& src, Uint32 len)
8950 {
8951 Uint32 * dst = header->m_offset;
8952 const Uint32 * save = dst;
8953 Uint32 offset = 0;
8954 for (Uint32 i = 0; i<len; i++)
8955 {
8956 * dst ++ = offset;
8957 Uint32 tmp = * src++;
8958 Uint32 tmp_len = AttributeHeader::getDataSize(tmp);
8959 offset += 1 + tmp_len;
8960 src += tmp_len;
8961 }
8962
8963 return header->m_len = static_cast<Uint32>(dst - save);
8964 }
8965
8966 Uint32
appendToPattern(Local_pattern_store & pattern,DABuffer & tree,Uint32 len)8967 Dbspj::appendToPattern(Local_pattern_store & pattern,
8968 DABuffer & tree, Uint32 len)
8969 {
8970 jam();
8971 if (unlikely(tree.ptr + len > tree.end))
8972 return DbspjErr::InvalidTreeNodeSpecification;
8973
8974 if (ERROR_INSERTED_CLEAR(17008))
8975 {
8976 ndbout_c("Injecting OutOfQueryMemory error 17008 at line %d file %s",
8977 __LINE__, __FILE__);
8978 jam();
8979 return DbspjErr::OutOfQueryMemory;
8980 }
8981 if (unlikely(pattern.append(tree.ptr, len)==0))
8982 return DbspjErr::OutOfQueryMemory;
8983
8984 tree.ptr += len;
8985 return 0;
8986 }
8987
8988 Uint32
appendParamToPattern(Local_pattern_store & dst,const RowPtr::Linear & row,Uint32 col)8989 Dbspj::appendParamToPattern(Local_pattern_store& dst,
8990 const RowPtr::Linear & row, Uint32 col)
8991 {
8992 jam();
8993 Uint32 offset = row.m_header->m_offset[col];
8994 const Uint32 * ptr = row.m_data + offset;
8995 Uint32 len = AttributeHeader::getDataSize(* ptr ++);
8996 /* Param COL's converted to DATA when appended to pattern */
8997 Uint32 info = QueryPattern::data(len);
8998
8999 if (ERROR_INSERTED_CLEAR(17009))
9000 {
9001 ndbout_c("Injecting OutOfQueryMemory error 17009 at line %d file %s",
9002 __LINE__, __FILE__);
9003 jam();
9004 return DbspjErr::OutOfQueryMemory;
9005 }
9006
9007 return dst.append(&info,1) && dst.append(ptr,len) ? 0 : DbspjErr::OutOfQueryMemory;
9008 }
9009
#ifdef ERROR_INSERT
static int fi_cnt = 0;
/**
 * Fault-injection wrapper around SimulatedBlock::appendToSection():
 * with error insert 17510 active, every 13th call fails.
 */
bool
Dbspj::appendToSection(Uint32& firstSegmentIVal,
                       const Uint32* src, Uint32 len)
{
  if (ERROR_INSERTED(17510) && fi_cnt++ % 13 == 0)
  {
    jam();
    ndbout_c("Injecting appendToSection error 17510 at line %d file %s",
             __LINE__, __FILE__);
    return false;
  }
  return SimulatedBlock::appendToSection(firstSegmentIVal, src, len);
}
#endif
9029
9030 Uint32
appendParamHeadToPattern(Local_pattern_store & dst,const RowPtr::Linear & row,Uint32 col)9031 Dbspj::appendParamHeadToPattern(Local_pattern_store& dst,
9032 const RowPtr::Linear & row, Uint32 col)
9033 {
9034 jam();
9035 Uint32 offset = row.m_header->m_offset[col];
9036 const Uint32 * ptr = row.m_data + offset;
9037 Uint32 len = AttributeHeader::getDataSize(*ptr);
9038 /* Param COL's converted to DATA when appended to pattern */
9039 Uint32 info = QueryPattern::data(len+1);
9040
9041 if (ERROR_INSERTED_CLEAR(17010))
9042 {
9043 ndbout_c("Injecting OutOfQueryMemory error 17010 at line %d file %s",
9044 __LINE__, __FILE__);
9045 jam();
9046 return DbspjErr::OutOfQueryMemory;
9047 }
9048
9049 return dst.append(&info,1) && dst.append(ptr,len+1) ? 0 : DbspjErr::OutOfQueryMemory;
9050 }
9051
9052 Uint32
appendReaderToSection(Uint32 & ptrI,SectionReader & reader,Uint32 len)9053 Dbspj::appendReaderToSection(Uint32 &ptrI, SectionReader &reader, Uint32 len)
9054 {
9055 while (len > 0)
9056 {
9057 jam();
9058 const Uint32* readPtr;
9059 Uint32 readLen;
9060 ndbrequire(reader.getWordsPtr(len, readPtr, readLen));
9061 if (unlikely(!appendToSection(ptrI, readPtr, readLen)))
9062 return DbspjErr::OutOfSectionMemory;
9063 len -= readLen;
9064 }
9065 return 0;
9066 }
9067
/**
 * Extract the 32-bit correlation number from column 'col' of a
 * segmented-section row. The column must be a one-word CORR_FACTOR32
 * attribute.
 */
void
Dbspj::getCorrelationData(const RowPtr::Section & row,
                          Uint32 col,
                          Uint32& correlationNumber)
{
  /**
   * TODO handle errors
   */
  SegmentedSectionPtr ptr(row.m_dataPtr);
  SectionReader reader(ptr, getSectionSegmentPool());
  Uint32 offset = row.m_header->m_offset[col];
  ndbrequire(reader.step(offset));    // position reader at the column
  Uint32 tmp;
  ndbrequire(reader.getWord(&tmp));   // consume the AttributeHeader word
  Uint32 len = AttributeHeader::getDataSize(tmp);
  ndbrequire(len == 1);               // CORR_FACTOR32 is a single word
  ndbrequire(AttributeHeader::getAttributeId(tmp) == AttributeHeader::CORR_FACTOR32);
  ndbrequire(reader.getWord(&correlationNumber));
}
9087
9088 void
getCorrelationData(const RowPtr::Linear & row,Uint32 col,Uint32 & correlationNumber)9089 Dbspj::getCorrelationData(const RowPtr::Linear & row,
9090 Uint32 col,
9091 Uint32& correlationNumber)
9092 {
9093 /**
9094 * TODO handle errors
9095 */
9096 Uint32 offset = row.m_header->m_offset[col];
9097 Uint32 tmp = row.m_data[offset];
9098 Uint32 len = AttributeHeader::getDataSize(tmp);
9099 ndbrequire(len == 1);
9100 ndbrequire(AttributeHeader::getAttributeId(tmp) == AttributeHeader::CORR_FACTOR32);
9101 correlationNumber = row.m_data[offset+1];
9102 }
9103
/**
 * Append the value of column 'col' from a segmented-section row to
 * section 'dst', excluding the AttributeHeader word. A zero-length
 * value is a NULL: 'hasNull' is set and nothing is appended.
 *
 * @returns 0 on success, else DbspjErr::OutOfSectionMemory
 */
Uint32
Dbspj::appendColToSection(Uint32 & dst, const RowPtr::Section & row,
                          Uint32 col, bool& hasNull)
{
  jam();
  /**
   * TODO handle errors
   */
  SegmentedSectionPtr ptr(row.m_dataPtr);
  SectionReader reader(ptr, getSectionSegmentPool());
  Uint32 offset = row.m_header->m_offset[col];
  ndbrequire(reader.step(offset));    // position reader at the column
  Uint32 tmp;
  ndbrequire(reader.getWord(&tmp));   // consume the AttributeHeader word
  Uint32 len = AttributeHeader::getDataSize(tmp);
  if (unlikely(len==0))
  {
    jam();
    hasNull = true; // NULL-value in key
    return 0;
  }
  return appendReaderToSection(dst, reader, len);
}
9127
9128 Uint32
appendColToSection(Uint32 & dst,const RowPtr::Linear & row,Uint32 col,bool & hasNull)9129 Dbspj::appendColToSection(Uint32 & dst, const RowPtr::Linear & row,
9130 Uint32 col, bool& hasNull)
9131 {
9132 jam();
9133 Uint32 offset = row.m_header->m_offset[col];
9134 const Uint32 * ptr = row.m_data + offset;
9135 Uint32 len = AttributeHeader::getDataSize(* ptr ++);
9136 if (unlikely(len==0))
9137 {
9138 jam();
9139 hasNull = true; // NULL-value in key
9140 return 0;
9141 }
9142 return appendToSection(dst, ptr, len) ? 0 : DbspjErr::OutOfSectionMemory;
9143 }
9144
9145 Uint32
appendAttrinfoToSection(Uint32 & dst,const RowPtr::Linear & row,Uint32 col,bool & hasNull)9146 Dbspj::appendAttrinfoToSection(Uint32 & dst, const RowPtr::Linear & row,
9147 Uint32 col, bool& hasNull)
9148 {
9149 jam();
9150 Uint32 offset = row.m_header->m_offset[col];
9151 const Uint32 * ptr = row.m_data + offset;
9152 Uint32 len = AttributeHeader::getDataSize(* ptr);
9153 if (unlikely(len==0))
9154 {
9155 jam();
9156 hasNull = true; // NULL-value in key
9157 }
9158 return appendToSection(dst, ptr, 1 + len) ? 0 : DbspjErr::OutOfSectionMemory;
9159 }
9160
/**
 * Append column 'col' of a segmented-section row - including its
 * AttributeHeader word - to section 'dst'. A zero-length value flags
 * NULL via 'hasNull', but the header word is still appended.
 *
 * @returns 0 on success, else DbspjErr::OutOfSectionMemory
 */
Uint32
Dbspj::appendAttrinfoToSection(Uint32 & dst, const RowPtr::Section & row,
                               Uint32 col, bool& hasNull)
{
  jam();
  /**
   * TODO handle errors
   */
  SegmentedSectionPtr ptr(row.m_dataPtr);
  SectionReader reader(ptr, getSectionSegmentPool());
  Uint32 offset = row.m_header->m_offset[col];
  ndbrequire(reader.step(offset));
  Uint32 tmp;
  ndbrequire(reader.peekWord(&tmp));  // peek: header word stays in the stream
  Uint32 len = AttributeHeader::getDataSize(tmp);
  if (unlikely(len==0))
  {
    jam();
    hasNull = true; // NULL-value in key
  }
  // 1 + len: the AttributeHeader word plus the attribute value
  return appendReaderToSection(dst, reader, 1 + len);
}
9183
/**
 * 'PkCol' is the composite NDB$PK column in an unique index consisting of
 * a fragment id and the composite PK value (all PK columns concatenated)
 *
 * This variant reads the row from a segmented section. Only the PK value
 * part is appended to 'dst'; the AttributeHeader word and the fragment id
 * word are skipped.
 */
Uint32
Dbspj::appendPkColToSection(Uint32 & dst, const RowPtr::Section & row, Uint32 col)
{
  jam();
  /**
   * TODO handle errors
   */
  SegmentedSectionPtr ptr(row.m_dataPtr);
  SectionReader reader(ptr, getSectionSegmentPool());
  Uint32 offset = row.m_header->m_offset[col];
  ndbrequire(reader.step(offset));     // position reader at the column
  Uint32 tmp;
  ndbrequire(reader.getWord(&tmp));    // consume the AttributeHeader word
  Uint32 len = AttributeHeader::getDataSize(tmp);
  ndbrequire(len>1); // NULL-value in PkKey is an error
  ndbrequire(reader.step(1)); // Skip fragid
  return appendReaderToSection(dst, reader, len-1);
}
9206
9207 /**
9208 * 'PkCol' is the composite NDB$PK column in an unique index consisting of
9209 * a fragment id and the composite PK value (all PK columns concatenated)
9210 */
9211 Uint32
appendPkColToSection(Uint32 & dst,const RowPtr::Linear & row,Uint32 col)9212 Dbspj::appendPkColToSection(Uint32 & dst, const RowPtr::Linear & row, Uint32 col)
9213 {
9214 jam();
9215 Uint32 offset = row.m_header->m_offset[col];
9216 Uint32 tmp = row.m_data[offset];
9217 Uint32 len = AttributeHeader::getDataSize(tmp);
9218 ndbrequire(len>1); // NULL-value in PkKey is an error
9219 return appendToSection(dst, row.m_data+offset+2, len - 1) ? 0 : DbspjErr::OutOfSectionMemory;
9220 }
9221
9222 Uint32
appendFromParent(Uint32 & dst,Local_pattern_store & pattern,Local_pattern_store::ConstDataBufferIterator & it,Uint32 levels,const RowPtr & rowptr,bool & hasNull)9223 Dbspj::appendFromParent(Uint32 & dst, Local_pattern_store& pattern,
9224 Local_pattern_store::ConstDataBufferIterator& it,
9225 Uint32 levels, const RowPtr & rowptr,
9226 bool& hasNull)
9227 {
9228 jam();
9229 Ptr<TreeNode> treeNodePtr;
9230 m_treenode_pool.getPtr(treeNodePtr, rowptr.m_src_node_ptrI);
9231 Uint32 corrVal = rowptr.m_src_correlation;
9232 RowPtr targetRow;
9233 DEBUG("appendFromParent-of"
9234 << " node: " << treeNodePtr.p->m_node_no);
9235 while (levels--)
9236 {
9237 jam();
9238 if (unlikely(treeNodePtr.p->m_parentPtrI == RNIL))
9239 {
9240 DEBUG_CRASH();
9241 return DbspjErr::InvalidPattern;
9242 }
9243 m_treenode_pool.getPtr(treeNodePtr, treeNodePtr.p->m_parentPtrI);
9244 DEBUG("appendFromParent"
9245 << ", node: " << treeNodePtr.p->m_node_no);
9246 if (unlikely(treeNodePtr.p->m_rows.m_type != RowCollection::COLLECTION_MAP))
9247 {
9248 DEBUG_CRASH();
9249 return DbspjErr::InvalidPattern;
9250 }
9251
9252 RowRef ref;
9253 treeNodePtr.p->m_rows.m_map.copyto(ref);
9254 const Uint32* const mapptr = get_row_ptr(ref);
9255
9256 Uint32 pos = corrVal >> 16; // parent corr-val
9257 if (unlikely(! (pos < treeNodePtr.p->m_rows.m_map.m_size)))
9258 {
9259 DEBUG_CRASH();
9260 return DbspjErr::InvalidPattern;
9261 }
9262
9263 // load ref to parent row
9264 treeNodePtr.p->m_rows.m_map.load(mapptr, pos, ref);
9265
9266 const Uint32* const rowptr = get_row_ptr(ref);
9267 setupRowPtr(treeNodePtr, targetRow, ref, rowptr);
9268
9269 if (levels)
9270 {
9271 jam();
9272 getCorrelationData(targetRow.m_row_data.m_linear,
9273 targetRow.m_row_data.m_linear.m_header->m_len - 1,
9274 corrVal);
9275 }
9276 }
9277
9278 if (unlikely(it.isNull()))
9279 {
9280 DEBUG_CRASH();
9281 return DbspjErr::InvalidPattern;
9282 }
9283
9284 Uint32 info = *it.data;
9285 Uint32 type = QueryPattern::getType(info);
9286 Uint32 val = QueryPattern::getLength(info);
9287 pattern.next(it);
9288 switch(type){
9289 case QueryPattern::P_COL:
9290 jam();
9291 return appendColToSection(dst, targetRow.m_row_data.m_linear, val, hasNull);
9292 case QueryPattern::P_UNQ_PK:
9293 jam();
9294 return appendPkColToSection(dst, targetRow.m_row_data.m_linear, val);
9295 case QueryPattern::P_ATTRINFO:
9296 jam();
9297 return appendAttrinfoToSection(dst, targetRow.m_row_data.m_linear, val, hasNull);
9298 case QueryPattern::P_DATA:
9299 jam();
9300 // retreiving DATA from parent...is...an error
9301 DEBUG_CRASH();
9302 return DbspjErr::InvalidPattern;
9303 case QueryPattern::P_PARENT:
9304 jam();
9305 // no point in nesting P_PARENT...an error
9306 DEBUG_CRASH();
9307 return DbspjErr::InvalidPattern;
9308 case QueryPattern::P_PARAM:
9309 case QueryPattern::P_PARAM_HEADER:
9310 jam();
9311 // should have been expanded during build
9312 DEBUG_CRASH();
9313 return DbspjErr::InvalidPattern;
9314 default:
9315 jam();
9316 DEBUG_CRASH();
9317 return DbspjErr::InvalidPattern;
9318 }
9319 }
9320
9321 Uint32
appendDataToSection(Uint32 & ptrI,Local_pattern_store & pattern,Local_pattern_store::ConstDataBufferIterator & it,Uint32 len,bool & hasNull)9322 Dbspj::appendDataToSection(Uint32 & ptrI,
9323 Local_pattern_store& pattern,
9324 Local_pattern_store::ConstDataBufferIterator& it,
9325 Uint32 len, bool& hasNull)
9326 {
9327 jam();
9328 if (unlikely(len==0))
9329 {
9330 jam();
9331 hasNull = true;
9332 return 0;
9333 }
9334
9335 #if 0
9336 /**
9337 * TODO handle errors
9338 */
9339 Uint32 tmp[NDB_SECTION_SEGMENT_SZ];
9340 while (len > NDB_SECTION_SEGMENT_SZ)
9341 {
9342 pattern.copyout(tmp, NDB_SECTION_SEGMENT_SZ, it);
9343 appendToSection(ptrI, tmp, NDB_SECTION_SEGMENT_SZ);
9344 len -= NDB_SECTION_SEGMENT_SZ;
9345 }
9346
9347 pattern.copyout(tmp, len, it);
9348 appendToSection(ptrI, tmp, len);
9349 return 0;
9350 #else
9351 Uint32 remaining = len;
9352 Uint32 dstIdx = 0;
9353 Uint32 tmp[NDB_SECTION_SEGMENT_SZ];
9354
9355 while (remaining > 0 && !it.isNull())
9356 {
9357 tmp[dstIdx] = *it.data;
9358 remaining--;
9359 dstIdx++;
9360 pattern.next(it);
9361 if (dstIdx == NDB_SECTION_SEGMENT_SZ || remaining == 0)
9362 {
9363 if (!appendToSection(ptrI, tmp, dstIdx))
9364 {
9365 jam();
9366 return DbspjErr::OutOfSectionMemory;
9367 }
9368 dstIdx = 0;
9369 }
9370 }
9371 if (remaining > 0)
9372 {
9373 DEBUG_CRASH();
9374 return DbspjErr::InvalidPattern;
9375 }
9376 else
9377 {
9378 return 0;
9379 }
9380 #endif
9381 }
9382
/**
 * This function takes a pattern and a row and expands it into a section.
 *
 * 'S' variant: the row data is in segmented-section form
 * (row.m_row_data.m_section). See ::expandL for the linear-row twin.
 *
 * @param _dst     Section i-value to append to; updated even on error so
 *                 the caller can release whatever was built.
 * @param pattern  The (already build-time expanded) key/attr pattern.
 * @param row      Source row the pattern tokens refer to.
 * @param hasNull  Set if any referenced column value is NULL.
 * @return 0 on success, else a DbspjErr error code.
 */
Uint32
Dbspj::expandS(Uint32 & _dst, Local_pattern_store& pattern,
               const RowPtr & row, bool& hasNull)
{
  Uint32 err;
  Uint32 dst = _dst;
  hasNull = false;
  Local_pattern_store::ConstDataBufferIterator it;
  pattern.first(it);
  while (!it.isNull())
  {
    Uint32 info = *it.data;
    Uint32 type = QueryPattern::getType(info);
    Uint32 val = QueryPattern::getLength(info);
    pattern.next(it);
    switch(type){
    case QueryPattern::P_COL:
      jam();
      err = appendColToSection(dst, row.m_row_data.m_section, val, hasNull);
      break;
    case QueryPattern::P_UNQ_PK:
      jam();
      err = appendPkColToSection(dst, row.m_row_data.m_section, val);
      break;
    case QueryPattern::P_ATTRINFO:
      jam();
      err = appendAttrinfoToSection(dst, row.m_row_data.m_section, val, hasNull);
      break;
    case QueryPattern::P_DATA:
      jam();
      // P_DATA: 'val' is the word count of immediate data following in
      // the pattern; appendDataToSection advances 'it' past it.
      err = appendDataToSection(dst, pattern, it, val, hasNull);
      break;
    case QueryPattern::P_PARENT:
      jam();
      // P_PARENT is a prefix to another pattern token
      // that permits code to access rows from earlier than immediate parent.
      // val is no of levels to move up the tree
      err = appendFromParent(dst, pattern, it, val, row, hasNull);
      break;
    // PARAM's was converted to DATA by ::expand(pattern...)
    case QueryPattern::P_PARAM:
    case QueryPattern::P_PARAM_HEADER:
    default:
      jam();
      err = DbspjErr::InvalidPattern;
      DEBUG_CRASH();
    }
    if (unlikely(err != 0))
    {
      jam();
      _dst = dst;
      return err;
    }
  }

  _dst = dst;
  return 0;
}
9444
/**
 * This function takes a pattern and a row and expands it into a section.
 *
 * 'L' variant: the row data is in linear form (row.m_row_data.m_linear).
 * See ::expandS for the segmented-section twin.
 *
 * @param _dst     Section i-value to append to; updated even on error so
 *                 the caller can release whatever was built.
 * @param pattern  The (already build-time expanded) key/attr pattern.
 * @param row      Source row the pattern tokens refer to.
 * @param hasNull  Set if any referenced column value is NULL.
 * @return 0 on success, else a DbspjErr error code.
 */
Uint32
Dbspj::expandL(Uint32 & _dst, Local_pattern_store& pattern,
               const RowPtr & row, bool& hasNull)
{
  Uint32 err;
  Uint32 dst = _dst;
  hasNull = false;
  Local_pattern_store::ConstDataBufferIterator it;
  pattern.first(it);
  while (!it.isNull())
  {
    Uint32 info = *it.data;
    Uint32 type = QueryPattern::getType(info);
    Uint32 val = QueryPattern::getLength(info);
    pattern.next(it);
    switch(type){
    case QueryPattern::P_COL:
      jam();
      err = appendColToSection(dst, row.m_row_data.m_linear, val, hasNull);
      break;
    case QueryPattern::P_UNQ_PK:
      jam();
      err = appendPkColToSection(dst, row.m_row_data.m_linear, val);
      break;
    case QueryPattern::P_ATTRINFO:
      jam();
      err = appendAttrinfoToSection(dst, row.m_row_data.m_linear, val, hasNull);
      break;
    case QueryPattern::P_DATA:
      jam();
      // P_DATA: 'val' is the word count of immediate data following in
      // the pattern; appendDataToSection advances 'it' past it.
      err = appendDataToSection(dst, pattern, it, val, hasNull);
      break;
    case QueryPattern::P_PARENT:
      jam();
      // P_PARENT is a prefix to another pattern token
      // that permits code to access rows from earlier than immediate parent
      // val is no of levels to move up the tree
      err = appendFromParent(dst, pattern, it, val, row, hasNull);
      break;
    // PARAM's was converted to DATA by ::expand(pattern...)
    case QueryPattern::P_PARAM:
    case QueryPattern::P_PARAM_HEADER:
    default:
      jam();
      err = DbspjErr::InvalidPattern;
      DEBUG_CRASH();
    }
    if (unlikely(err != 0))
    {
      jam();
      _dst = dst;
      return err;
    }
  }

  _dst = dst;
  return 0;
}
9506
/* ::expand() used during initial 'build' phase on 'tree' + 'param' from API */
/**
 * Expand a "fixed" pattern (no linked/parent references) directly into a
 * section: P_PARAM / P_PARAM_HEADER tokens are resolved against the
 * 'param' buffer (treated as a linear row), P_DATA is copied verbatim.
 *
 * @param ptrI      Section i-value to append to; updated even on error.
 * @param pattern   Pattern buffer; pattern.ptr is advanced past 'len'
 *                  words on success.
 * @param len       Pattern length in words.
 * @param param     Parameter values from the API.
 * @param paramCnt  Number of parameter values in 'param'.
 * @param hasNull   Set if any parameter value is NULL or a P_DATA token
 *                  has zero length.
 * @return 0 on success, else a DbspjErr error code.
 */
Uint32
Dbspj::expand(Uint32 & ptrI, DABuffer& pattern, Uint32 len,
              DABuffer& param, Uint32 paramCnt, bool& hasNull)
{
  jam();
  /**
   * TODO handle error
   */
  Uint32 err = 0;
  Uint32 tmp[1+MAX_ATTRIBUTES_IN_TABLE];
  // View the parameter buffer as a linear row so that the appendXxx
  // helpers can address individual parameter values by column number.
  struct RowPtr::Linear row;
  row.m_data = param.ptr;
  row.m_header = CAST_PTR(RowPtr::Header, &tmp[0]);
  buildRowHeader(CAST_PTR(RowPtr::Header, &tmp[0]), param.ptr, paramCnt);

  Uint32 dst = ptrI;
  const Uint32 * ptr = pattern.ptr;
  const Uint32 * end = ptr + len;
  hasNull = false;

  for (; ptr < end; )
  {
    Uint32 info = * ptr++;
    Uint32 type = QueryPattern::getType(info);
    Uint32 val = QueryPattern::getLength(info);
    switch(type){
    case QueryPattern::P_PARAM:
      jam();
      ndbassert(val < paramCnt);
      err = appendColToSection(dst, row, val, hasNull);
      break;
    case QueryPattern::P_PARAM_HEADER:
      jam();
      ndbassert(val < paramCnt);
      err = appendAttrinfoToSection(dst, row, val, hasNull);
      break;
    case QueryPattern::P_DATA:
      if (unlikely(val==0))
      {
        jam();
        hasNull = true;
      }
      else if (likely(appendToSection(dst, ptr, val)))
      {
        jam();
        ptr += val;
      }
      else
      {
        jam();
        err = DbspjErr::OutOfSectionMemory;
      }
      break;
    case QueryPattern::P_COL:    // (linked) COL's not expected here
    case QueryPattern::P_PARENT: // Prefix to P_COL
    case QueryPattern::P_ATTRINFO:
    case QueryPattern::P_UNQ_PK:
    default:
      jam();
      jamLine(type);
      err = DbspjErr::InvalidPattern;
    }
    if (unlikely(err != 0))
    {
      jam();
      ptrI = dst;
      return err;
    }
  }

  /**
   * Iterate forward
   */
  pattern.ptr = end;
  ptrI = dst;
  return 0;
}
9585
/* ::expand() used during initial 'build' phase on 'tree' + 'param' from API */
/**
 * Expand a "linked" pattern into a new pattern store: parameter tokens
 * (P_PARAM / P_PARAM_HEADER) are resolved now and converted to P_DATA,
 * while row-referencing tokens (P_COL / P_UNQ_PK / P_ATTRINFO / P_PARENT)
 * are copied through for per-row expansion at send time (expandS/expandL).
 *
 * A P_PARENT prefix additionally requests the referenced ancestor nodes to
 * buffer their result rows (T_BUFFER_ROW + T_BUFFER_MAP) so that the rows
 * are still available when the pattern is later expanded.
 *
 * @return 0 on success, else a DbspjErr error code.
 */
Uint32
Dbspj::expand(Local_pattern_store& dst, Ptr<TreeNode> treeNodePtr,
              DABuffer& pattern, Uint32 len,
              DABuffer& param, Uint32 paramCnt)
{
  jam();
  /**
   * TODO handle error
   */
  Uint32 err;
  Uint32 tmp[1+MAX_ATTRIBUTES_IN_TABLE];
  // View the parameter buffer as a linear row (see the other ::expand).
  struct RowPtr::Linear row;
  row.m_header = CAST_PTR(RowPtr::Header, &tmp[0]);
  row.m_data = param.ptr;
  buildRowHeader(CAST_PTR(RowPtr::Header, &tmp[0]), param.ptr, paramCnt);

  const Uint32 * end = pattern.ptr + len;
  for (; pattern.ptr < end; )
  {
    Uint32 info = *pattern.ptr;
    Uint32 type = QueryPattern::getType(info);
    Uint32 val = QueryPattern::getLength(info);
    switch(type){
    case QueryPattern::P_COL:
    case QueryPattern::P_UNQ_PK:
    case QueryPattern::P_ATTRINFO:
      jam();
      err = appendToPattern(dst, pattern, 1);
      break;
    case QueryPattern::P_DATA:
      jam();
      // Copy the token word plus 'val' words of immediate data.
      err = appendToPattern(dst, pattern, val+1);
      break;
    case QueryPattern::P_PARAM:
      jam();
      // NOTE: Converted to P_DATA by appendParamToPattern
      ndbassert(val < paramCnt);
      err = appendParamToPattern(dst, row, val);
      pattern.ptr++;
      break;
    case QueryPattern::P_PARAM_HEADER:
      jam();
      // NOTE: Converted to P_DATA by appendParamHeadToPattern
      ndbassert(val < paramCnt);
      err = appendParamHeadToPattern(dst, row, val);
      pattern.ptr++;
      break;
    case QueryPattern::P_PARENT: // Prefix to P_COL
    {
      jam();
      err = appendToPattern(dst, pattern, 1);
      if (unlikely(err))
      {
        jam();
        break;
      }
      // Locate requested grandparent and request it to
      // T_BUFFER_ROW its result rows
      Ptr<TreeNode> parentPtr;
      m_treenode_pool.getPtr(parentPtr, treeNodePtr.p->m_parentPtrI);
      while (val--)
      {
        jam();
        ndbassert(parentPtr.p->m_parentPtrI != RNIL);
        m_treenode_pool.getPtr(parentPtr, parentPtr.p->m_parentPtrI);
        parentPtr.p->m_bits |= TreeNode::T_BUFFER_ROW;
        parentPtr.p->m_bits |= TreeNode::T_BUFFER_MAP;
      }
      Ptr<Request> requestPtr;
      m_request_pool.getPtr(requestPtr, treeNodePtr.p->m_requestPtrI);
      requestPtr.p->m_bits |= Request::RT_BUFFERS;
      break;
    }
    default:
      err = DbspjErr::InvalidPattern;
      jam();
    }

    if (unlikely(err != 0))
    {
      jam();
      return err;
    }
  }
  return 0;
}
9673
/**
 * Parse the 'DA' (Data Access) part of a query-tree node together with its
 * matching parameters from the API, and populate 'treeNodePtr' with:
 * parent linkage, the key pattern (fixed or per-row constructed), and the
 * attrinfo section (interpreted program + projections).
 *
 * 'tree'/'param' are positioned at the node's optional parts and are
 * advanced as the parts are consumed; 'treeBits'/'paramBits' say which
 * optional parts are present.
 *
 * Returns 0 on success, else a DbspjErr error code. On error, any
 * partially-built attrinfo/attr-param sections are released before return.
 */
Uint32
Dbspj::parseDA(Build_context& ctx,
               Ptr<Request> requestPtr,
               Ptr<TreeNode> treeNodePtr,
               DABuffer& tree, Uint32 treeBits,
               DABuffer& param, Uint32 paramBits)
{
  Uint32 err;
  Uint32 attrInfoPtrI = RNIL;
  Uint32 attrParamPtrI = RNIL;

  do
  {
    /**
     * Test execution terminated due to 'OutOfSectionMemory' which
     * may happen multiple places (eg. appendtosection, expand) below:
     * - 17050: Fail on parseDA at first call
     * - 17051: Fail on parseDA if 'isLeaf'
     * - 17052: Fail on parseDA if treeNode not root
     * - 17053: Fail on parseDA at a random node of the query tree
     */
    if (ERROR_INSERTED(17050) ||
        (ERROR_INSERTED(17051) && (treeNodePtr.p->isLeaf())) ||
        (ERROR_INSERTED(17052) && (treeNodePtr.p->m_parentPtrI != RNIL)) ||
        (ERROR_INSERTED(17053) && (rand() % 7) == 0))
    {
      jam();
      CLEAR_ERROR_INSERT_VALUE;
      ndbout_c("Injecting OutOfSectionMemory error at line %d file %s",
               __LINE__, __FILE__);
      err = DbspjErr::OutOfSectionMemory;
      break;
    }

    if (treeBits & DABits::NI_REPEAT_SCAN_RESULT)
    {
      jam();
      DEBUG("use REPEAT_SCAN_RESULT when returning results");
      requestPtr.p->m_bits |= Request::RT_REPEAT_SCAN_RESULT;
    } // DABits::NI_REPEAT_SCAN_RESULT

    if (treeBits & DABits::NI_INNER_JOIN)
    {
      jam();
      DEBUG("INNER_JOIN optimization used");
      treeNodePtr.p->m_bits |= TreeNode::T_INNER_JOIN;
    } // DABits::NI_INNER_JOIN

    // TODO: FirstMatch not implemented in SPJ block yet.
    // Later implementation will build on the BUFFER_ROW / _MATCH mechanisms
    // to eliminate already found matches from SCAN_NEXTREQ
    if (treeBits & DABits::NI_FIRST_MATCH)
    {
      jam();
      DEBUG("FIRST_MATCH optimization used");
      treeNodePtr.p->m_bits |= TreeNode::T_FIRST_MATCH;
    } // DABits::NI_FIRST_MATCH

    if (treeBits & DABits::NI_HAS_PARENT)
    {
      jam();
      DEBUG("NI_HAS_PARENT");
      /**
       * OPTIONAL PART 1:
       *
       * Parent nodes are stored first in optional part
       * this is a list of 16-bit numbers refering to
       * *earlier* nodes in tree
       * the list stores length of list as first 16-bit
       */
      err = DbspjErr::InvalidTreeNodeSpecification;
      Uint32 dst[63];
      Uint32 cnt = unpackList(NDB_ARRAY_SIZE(dst), dst, tree);
      if (unlikely(cnt > NDB_ARRAY_SIZE(dst)))
      {
        jam();
        break;
      }

      if (unlikely(cnt!=1))
      {
        /**
         * Only a single parent supported for now, i.e only trees
         */
        jam();
        break;
      }

      err = 0;
      for (Uint32 i = 0; i<cnt; i++)
      {
        DEBUG("adding " << dst[i] << " as parent");
        // Link this node into the parent's child list, and remember the
        // parent. ctx.m_node_list maps node-numbers to already-built nodes.
        Ptr<TreeNode> parentPtr = ctx.m_node_list[dst[i]];
        LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
        Local_dependency_map map(pool, parentPtr.p->m_child_nodes);
        if (unlikely(!map.append(&treeNodePtr.i, 1)))
        {
          err = DbspjErr::OutOfQueryMemory;
          jam();
          break;
        }
        treeNodePtr.p->m_parentPtrI = parentPtr.i;
      }

      if (unlikely(err != 0))
        break;
    } // DABits::NI_HAS_PARENT

    // Key-parameter presence must agree between tree and param parts.
    err = DbspjErr::InvalidTreeParametersSpecificationKeyParamBitsMissmatch;
    if (unlikely( ((treeBits & DABits::NI_KEY_PARAMS)==0) !=
                  ((paramBits & DABits::PI_KEY_PARAMS)==0)))
    {
      jam();
      break;
    }

    if (treeBits & (DABits::NI_KEY_PARAMS
                    | DABits::NI_KEY_LINKED
                    | DABits::NI_KEY_CONSTS))
    {
      jam();
      DEBUG("NI_KEY_PARAMS | NI_KEY_LINKED | NI_KEY_CONSTS");

      /**
       * OPTIONAL PART 2:
       *
       * If keys are parametrized or linked
       * DATA0[LO/HI] - Length of key pattern/#parameters to key
       */
      Uint32 len_cnt = * tree.ptr ++;
      Uint32 len = len_cnt & 0xFFFF; // length of pattern in words
      Uint32 cnt = len_cnt >> 16;    // no of parameters

      err = DbspjErr::InvalidTreeParametersSpecificationIncorrectKeyParamCount;
      if (unlikely( ((cnt==0) != ((treeBits & DABits::NI_KEY_PARAMS) == 0)) ||
                    ((cnt==0) != ((paramBits & DABits::PI_KEY_PARAMS) == 0))))
      {
        jam();
        break;
      }

      if (treeBits & DABits::NI_KEY_LINKED)
      {
        jam();
        LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena, m_dependency_map_pool);
        Local_pattern_store pattern(pool, treeNodePtr.p->m_keyPattern);

        DEBUG("LINKED-KEY PATTERN w/ " << cnt << " PARAM values");
        /**
         * Expand pattern into a new pattern (with linked values)
         */
        err = expand(pattern, treeNodePtr, tree, len, param, cnt);
        if (unlikely(err != 0))
        {
          jam();
          break;
        }
        /**
         * This node constructs a new key for each send
         */
        treeNodePtr.p->m_bits |= TreeNode::T_KEYINFO_CONSTRUCTED;
      }
      else
      {
        jam();
        DEBUG("FIXED-KEY w/ " << cnt << " PARAM values");
        /**
         * Expand pattern directly into keyinfo
         * This means a "fixed" key from here on
         */
        bool hasNull;
        Uint32 keyInfoPtrI = RNIL;
        err = expand(keyInfoPtrI, tree, len, param, cnt, hasNull);
        if (unlikely(err != 0))
        {
          jam();
          releaseSection(keyInfoPtrI);
          break;
        }
        if (unlikely(hasNull))
        {
          /* API should have elliminated requests w/ const-NULL keys */
          jam();
          DEBUG("BEWARE: FIXED-key contain NULL values");
          releaseSection(keyInfoPtrI);
          // treeNodePtr.p->m_bits |= TreeNode::T_NULL_PRUNE;
          // break;
          ndbabort();
        }
        treeNodePtr.p->m_send.m_keyInfoPtrI = keyInfoPtrI;
      }
      ndbassert(err == 0); // All errors should have been handled
    } // DABits::NI_KEY_...

    const Uint32 mask =
      DABits::NI_LINKED_ATTR | DABits::NI_ATTR_INTERPRET |
      DABits::NI_ATTR_LINKED | DABits::NI_ATTR_PARAMS;

    if (((treeBits & mask) | (paramBits & DABits::PI_ATTR_LIST)) != 0)
    {
      jam();
      /**
       * OPTIONAL PART 3: attrinfo handling
       * - NI_LINKED_ATTR - these are attributes to be passed to children
       * - PI_ATTR_LIST   - this is "user-columns" (passed as parameters)

       * - NI_ATTR_INTERPRET - tree contains interpreted program
       * - NI_ATTR_LINKED - means that the attr-info contains linked-values
       * - NI_ATTR_PARAMS - means that the attr-info is parameterized
       *   PI_ATTR_PARAMS - means that the parameters contains attr parameters
       *
       * IF NI_ATTR_INTERPRET
       *   DATA0[LO/HI] = Length of program / total #arguments to program
       *   DATA1..N     = Program
       *
       * IF NI_ATTR_PARAMS
       *   DATA0[LO/HI] = Length / #param
       *   DATA1..N     = PARAM-0...PARAM-M
       *
       * IF PI_ATTR_INTERPRET
       *   DATA0[LO/HI] = Length of program / Length of subroutine-part
       *   DATA1..N     = Program (scan filter)
       *
       * IF NI_ATTR_LINKED
       *   DATA0[LO/HI] = Length / #
       *
       *
       */
      // sections[0..4]: the 5 attrinfo section-length header words used by
      // interpreted execution (see Dbtup interpreter section layout).
      Uint32 sections[5] = { 0, 0, 0, 0, 0 };
      Uint32 * sectionptrs = 0;

      bool interpreted =
        (treeBits & DABits::NI_ATTR_INTERPRET) ||
        (paramBits & DABits::PI_ATTR_INTERPRET) ||
        (treeNodePtr.p->m_bits & TreeNode::T_ATTR_INTERPRETED);

      if (interpreted)
      {
        /**
         * Add section headers for interpreted execution
         *   and create pointer so that they can be updated later
         */
        jam();
        err = DbspjErr::OutOfSectionMemory;
        if (unlikely(!appendToSection(attrInfoPtrI, sections, 5)))
        {
          jam();
          break;
        }

        SegmentedSectionPtr ptr;
        getSection(ptr, attrInfoPtrI);
        sectionptrs = ptr.p->theData;

        if (treeBits & DABits::NI_ATTR_INTERPRET)
        {
          jam();

          /**
           * Having two interpreter programs is an error.
           */
          err = DbspjErr::BothTreeAndParametersContainInterpretedProgram;
          if (unlikely(paramBits & DABits::PI_ATTR_INTERPRET))
          {
            jam();
            break;
          }

          treeNodePtr.p->m_bits |= TreeNode::T_ATTR_INTERPRETED;
          Uint32 len2 = * tree.ptr++;
          Uint32 len_prg = len2 & 0xFFFF;  // Length of interpret program
          Uint32 len_pattern = len2 >> 16; // Length of attr param pattern
          err = DbspjErr::OutOfSectionMemory;
          if (unlikely(!appendToSection(attrInfoPtrI, tree.ptr, len_prg)))
          {
            jam();
            break;
          }

          tree.ptr += len_prg;
          sectionptrs[1] = len_prg; // size of interpret program

          Uint32 tmp = * tree.ptr ++; // attr-pattern header
          Uint32 cnt = tmp & 0xFFFF;

          if (treeBits & DABits::NI_ATTR_LINKED)
          {
            jam();
            /**
             * Expand pattern into a new pattern (with linked values)
             */
            LocalArenaPool<DataBufferSegment<14> > pool(requestPtr.p->m_arena,
                                                        m_dependency_map_pool);
            Local_pattern_store pattern(pool,treeNodePtr.p->m_attrParamPattern);
            err = expand(pattern, treeNodePtr, tree, len_pattern, param, cnt);
            if (unlikely(err))
            {
              jam();
              break;
            }
            /**
             * This node constructs a new attr-info for each send
             */
            treeNodePtr.p->m_bits |= TreeNode::T_ATTRINFO_CONSTRUCTED;
          }
          else
          {
            jam();
            /**
             * Expand pattern directly into attr-info param
             * This means a "fixed" attr-info param from here on
             */
            bool hasNull;
            err = expand(attrParamPtrI, tree, len_pattern, param, cnt, hasNull);
            if (unlikely(err))
            {
              jam();
              break;
            }
            // ndbrequire(!hasNull);
          }
        }
        else // if (treeBits & DABits::NI_ATTR_INTERPRET)
        {
          jam();
          /**
           * Only relevant for interpreted stuff
           */
          ndbrequire((treeBits & DABits::NI_ATTR_PARAMS) == 0);
          ndbrequire((paramBits & DABits::PI_ATTR_PARAMS) == 0);
          ndbrequire((treeBits & DABits::NI_ATTR_LINKED) == 0);

          treeNodePtr.p->m_bits |= TreeNode::T_ATTR_INTERPRETED;

          if (! (paramBits & DABits::PI_ATTR_INTERPRET))
          {
            jam();

            /**
             * Tree node has interpreted execution,
             *   but no interpreted program specified
             *   auto-add Exit_ok (i.e return each row)
             */
            Uint32 tmp = Interpreter::ExitOK();
            err = DbspjErr::OutOfSectionMemory;
            if (unlikely(!appendToSection(attrInfoPtrI, &tmp, 1)))
            {
              jam();
              break;
            }
            sectionptrs[1] = 1;
          }
        } // if (treeBits & DABits::NI_ATTR_INTERPRET)
      } // if (interpreted)

      if (paramBits & DABits::PI_ATTR_INTERPRET)
      {
        jam();

        /**
         * Add the interpreted code that represents the scan filter.
         */
        const Uint32 len2 = * param.ptr++;
        Uint32 program_len = len2 & 0xFFFF;
        Uint32 subroutine_len = len2 >> 16;
        err = DbspjErr::OutOfSectionMemory;
        if (unlikely(!appendToSection(attrInfoPtrI, param.ptr, program_len)))
        {
          jam();
          break;
        }
        /**
         * The interpreted code is added is in the "Interpreted execute region"
         * of the attrinfo (see Dbtup::interpreterStartLab() for details).
         * It will thus execute before reading the attributes that constitutes
         * the projections.
         */
        sectionptrs[1] = program_len;
        param.ptr += program_len;

        if (subroutine_len)
        {
          // Subroutines are staged in attrParamPtrI and appended to the
          // attrinfo's subroutine section (sectionptrs[4]) further below.
          if (unlikely(!appendToSection(attrParamPtrI,
                                        param.ptr, subroutine_len)))
          {
            jam();
            break;
          }
          sectionptrs[4] = subroutine_len;
          param.ptr += subroutine_len;
        }
        treeNodePtr.p->m_bits |= TreeNode::T_ATTR_INTERPRETED;
      }

      Uint32 sum_read = 0; // total words appended to the 'read' section
      Uint32 dst[MAX_ATTRIBUTES_IN_TABLE + 2];

      if (paramBits & DABits::PI_ATTR_LIST)
      {
        jam();
        Uint32 len = * param.ptr++;
        DEBUG("PI_ATTR_LIST");

        treeNodePtr.p->m_bits |= TreeNode::T_USER_PROJECTION;
        err = DbspjErr::OutOfSectionMemory;
        if (!appendToSection(attrInfoPtrI, param.ptr, len))
        {
          jam();
          break;
        }

        param.ptr += len;
        sum_read += len;

        const NodeId API_node = refToNode(ctx.m_resultRef);
        const Uint32 API_version = getNodeInfo(API_node).m_version;

        /**
         * We have just added a 'USER_PROJECTION' which is the
         * result row to the SPJ-API. If we will also add a
         * projection of SPJ keys (NI_LINKED_ATTR), we need to
         * insert a FLUSH of the client results now, else the
         * FLUSH is skipped as we produced a single result
         * projection only. (to API client)
         *
         * However, for scan requests we will always need to FLUSH:
         * LqhKeyReq::tcBlockref need to refer this SPJ block as
         * it is used to send the required REF/CONF to SPJ. However,
         * tcBlockref is also used as the 'route' dest for TRANSID_AI_R,
         * which should be routed to the requesting TC block. Thus
         * we need the FLUSH which specifies its own RouteRef.
         *
         * Also need to have this under API-version control, as
         * older API versions assumed that all SPJ results were
         * returned as 'long' signals.
         */
        if (treeBits & DABits::NI_LINKED_ATTR ||
            requestPtr.p->isScan() ||
            !ndbd_spj_api_support_short_TRANSID_AI(API_version))
        {
          /**
           * Insert a FLUSH_AI of 'USER_PROJECTION' result (to client)
           * before 'LINKED_ATTR' results to SPJ is produced.
           */
          jam();
          Uint32 flush[4];
          flush[0] = AttributeHeader::FLUSH_AI << 16;
          flush[1] = ctx.m_resultRef;
          flush[2] = ctx.m_resultData;
          flush[3] = ctx.m_senderRef; // RouteRef
          if (!appendToSection(attrInfoPtrI, flush, 4))
          {
            jam();
            break;
          }
          sum_read += 4;
        }
      }

      if (treeBits & DABits::NI_LINKED_ATTR)
      {
        jam();
        DEBUG("NI_LINKED_ATTR");
        err = DbspjErr::InvalidTreeNodeSpecification;
        Uint32 cnt = unpackList(MAX_ATTRIBUTES_IN_TABLE, dst, tree);
        if (unlikely(cnt > MAX_ATTRIBUTES_IN_TABLE))
        {
          jam();
          break;
        }

        /**
         * AttributeHeader contains attrId in 16-higher bits
         */
        for (Uint32 i = 0; i<cnt; i++)
          dst[i] <<= 16;

        /**
         * Read correlation factor
         */
        dst[cnt++] = AttributeHeader::CORR_FACTOR32 << 16;

        err = DbspjErr::OutOfSectionMemory;
        if (!appendToSection(attrInfoPtrI, dst, cnt))
        {
          jam();
          break;
        }
        sum_read += cnt;
        treeNodePtr.p->m_bits |= TreeNode::T_EXPECT_TRANSID_AI;

        // Having a key projection for LINKED child, implies not-LEAF
        treeNodePtr.p->m_bits &= ~(Uint32)TreeNode::T_LEAF;
      }
      /**
       * If no LINKED_ATTR's including the CORR_FACTOR was requested by
       * the API, the SPJ-block does its own request of a CORR_FACTOR.
       * Will be used to keep track of whether a 'match' was found
       * for the requested parent row.
       */
      else if (requestPtr.p->isScan() &&
               (treeNodePtr.p->m_bits & TreeNode::T_INNER_JOIN))
      {
        jam();
        Uint32 cnt = 0;
        /**
         * Only read correlation factor
         */
        dst[cnt++] = AttributeHeader::CORR_FACTOR32 << 16;

        err = DbspjErr::OutOfSectionMemory;
        if (!appendToSection(attrInfoPtrI, dst, cnt))
        {
          jam();
          break;
        }
        sum_read += cnt;
        treeNodePtr.p->m_bits |= TreeNode::T_EXPECT_TRANSID_AI;
      }

      if (interpreted)
      {
        jam();
        /**
         * Let reads be performed *after* interpreted program
         *   i.e in "final read"-section
         */
        sectionptrs[3] = sum_read;

        if (attrParamPtrI != RNIL)
        {
          jam();
          ndbrequire(!(treeNodePtr.p->m_bits&TreeNode::T_ATTRINFO_CONSTRUCTED));

          // Move the staged subroutine/param words to the end of the
          // attrinfo section, then drop the staging section.
          SegmentedSectionPtr ptr;
          getSection(ptr, attrParamPtrI);
          {
            SectionReader r0(ptr, getSectionSegmentPool());
            err = appendReaderToSection(attrInfoPtrI, r0, ptr.sz);
            if (unlikely(err != 0))
            {
              jam();
              break;
            }
            sectionptrs[4] = ptr.sz;
          }
          releaseSection(attrParamPtrI);
          attrParamPtrI = RNIL;
        }
      }

      // Ownership of the attrinfo section transfers to the tree node.
      treeNodePtr.p->m_send.m_attrInfoPtrI = attrInfoPtrI;
      attrInfoPtrI = RNIL;
    } // if (((treeBits & mask) | (paramBits & DABits::PI_ATTR_LIST)) != 0)

    // Empty attrinfo would cause node crash.
    if (treeNodePtr.p->m_send.m_attrInfoPtrI == RNIL)
    {
      jam();

      // Add dummy interpreted program.
      Uint32 tmp = Interpreter::ExitOK();
      err = DbspjErr::OutOfSectionMemory;
      if (unlikely(!appendToSection(treeNodePtr.p->m_send.m_attrInfoPtrI, &tmp, 1)))
      {
        jam();
        break;
      }
    }

    return 0;
  } while (0);

  // Error exit: release any sections still owned by this function.
  if (attrInfoPtrI != RNIL)
  {
    jam();
    releaseSection(attrInfoPtrI);
  }

  if (attrParamPtrI != RNIL)
  {
    jam();
    releaseSection(attrParamPtrI);
  }

  return err;
}
10261
10262 /**
10263 * END - MODULE COMMON PARSE/UNPACK
10264 */
10265
10266 /**
10267 * Process a scan request for an ndb$info table. (These are used for monitoring
10268 * purposes and do not contain application data.)
10269 */
execDBINFO_SCANREQ(Signal * signal)10270 void Dbspj::execDBINFO_SCANREQ(Signal *signal)
10271 {
10272 DbinfoScanReq req= * CAST_PTR(DbinfoScanReq, &signal->theData[0]);
10273 const Ndbinfo::ScanCursor* cursor =
10274 CAST_CONSTPTR(Ndbinfo::ScanCursor, DbinfoScan::getCursorPtr(&req));
10275 Ndbinfo::Ratelimit rl;
10276
10277 jamEntry();
10278
10279 switch(req.tableId){
10280
10281 // The SPJ block only implements the ndbinfo.counters table.
10282 case Ndbinfo::COUNTERS_TABLEID:
10283 {
10284 Ndbinfo::counter_entry counters[] = {
10285 { Ndbinfo::SPJ_READS_RECEIVED_COUNTER,
10286 c_Counters.get_counter(CI_READS_RECEIVED) },
10287 { Ndbinfo::SPJ_LOCAL_READS_SENT_COUNTER,
10288 c_Counters.get_counter(CI_LOCAL_READS_SENT) },
10289 { Ndbinfo::SPJ_REMOTE_READS_SENT_COUNTER,
10290 c_Counters.get_counter(CI_REMOTE_READS_SENT) },
10291 { Ndbinfo::SPJ_READS_NOT_FOUND_COUNTER,
10292 c_Counters.get_counter(CI_READS_NOT_FOUND) },
10293 { Ndbinfo::SPJ_TABLE_SCANS_RECEIVED_COUNTER,
10294 c_Counters.get_counter(CI_TABLE_SCANS_RECEIVED) },
10295 { Ndbinfo::SPJ_LOCAL_TABLE_SCANS_SENT_COUNTER,
10296 c_Counters.get_counter(CI_LOCAL_TABLE_SCANS_SENT) },
10297 { Ndbinfo::SPJ_RANGE_SCANS_RECEIVED_COUNTER,
10298 c_Counters.get_counter(CI_RANGE_SCANS_RECEIVED) },
10299 { Ndbinfo::SPJ_LOCAL_RANGE_SCANS_SENT_COUNTER,
10300 c_Counters.get_counter(CI_LOCAL_RANGE_SCANS_SENT) },
10301 { Ndbinfo::SPJ_REMOTE_RANGE_SCANS_SENT_COUNTER,
10302 c_Counters.get_counter(CI_REMOTE_RANGE_SCANS_SENT) },
10303 { Ndbinfo::SPJ_SCAN_BATCHES_RETURNED_COUNTER,
10304 c_Counters.get_counter(CI_SCAN_BATCHES_RETURNED) },
10305 { Ndbinfo::SPJ_SCAN_ROWS_RETURNED_COUNTER,
10306 c_Counters.get_counter(CI_SCAN_ROWS_RETURNED) },
10307 { Ndbinfo::SPJ_PRUNED_RANGE_SCANS_RECEIVED_COUNTER,
10308 c_Counters.get_counter(CI_PRUNED_RANGE_SCANS_RECEIVED) },
10309 { Ndbinfo::SPJ_CONST_PRUNED_RANGE_SCANS_RECEIVED_COUNTER,
10310 c_Counters.get_counter(CI_CONST_PRUNED_RANGE_SCANS_RECEIVED) }
10311 };
10312 const size_t num_counters = sizeof(counters) / sizeof(counters[0]);
10313
10314 Uint32 i = cursor->data[0];
10315 const BlockNumber bn = blockToMain(number());
10316 while(i < num_counters)
10317 {
10318 jam();
10319 Ndbinfo::Row row(signal, req);
10320 row.write_uint32(getOwnNodeId());
10321 row.write_uint32(bn); // block number
10322 row.write_uint32(instance()); // block instance
10323 row.write_uint32(counters[i].id);
10324
10325 row.write_uint64(counters[i].val);
10326 ndbinfo_send_row(signal, req, row, rl);
10327 i++;
10328 if (rl.need_break(req))
10329 {
10330 jam();
10331 ndbinfo_send_scan_break(signal, req, rl, i);
10332 return;
10333 }
10334 }
10335 break;
10336 }
10337
10338 default:
10339 break;
10340 }
10341
10342 ndbinfo_send_scan_conf(signal, req, rl);
10343 } // Dbspj::execDBINFO_SCANREQ(Signal *signal)
10344
10345
10346 /**
10347 * Incremental calculation of standard deviation:
10348 *
10349 * Suppose that the data set is x1, x2,..., xn then for each xn
 * we can find an updated mean (M) and sum of squared deviations (S) as:
10351 *
10352 * M(1) = x(1), M(k) = M(k-1) + (x(k) - M(k-1)) / k
10353 * S(1) = 0, S(k) = S(k-1) + (x(k) - M(k-1)) * (x(k) - M(k))
10354 *
10355 * Source: http://mathcentral.uregina.ca/QQ/database/QQ.09.02/carlos1.html
10356 */
sample(double observation)10357 void Dbspj::IncrementalStatistics::sample(double observation)
10358 {
10359 // Prevent wrap-around
10360 if(m_noOfSamples < 0xffffffff)
10361 {
10362 m_noOfSamples++;
10363 const double delta = observation - m_mean;
10364 m_mean += delta/m_noOfSamples;
10365 m_sumSquare += delta * (observation - m_mean);
10366 }
10367 }
10368