1 /* Copyright (C) 2014 InfiniDB, Inc.
2 
3    This program is free software; you can redistribute it and/or
4    modify it under the terms of the GNU General Public License
5    as published by the Free Software Foundation; version 2 of
6    the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16    MA 02110-1301, USA. */
17 
18 /*****************************************************************************
19  * $Id: extentmap.cpp 1936 2013-07-09 22:10:29Z dhall $
20  *
21  ****************************************************************************/
22 
23 #include <iostream>
24 #include <sys/types.h>
25 #include <sys/time.h>
26 #include <sys/stat.h>
27 #include <cstdlib>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <stdexcept>
31 #include <algorithm>
32 #include <ios>
33 #include <cerrno>
34 #include <sstream>
35 #include <vector>
36 #include <limits>
37 #include <boost/scoped_array.hpp>
38 #include <boost/scoped_ptr.hpp>
39 #include <boost/thread.hpp>
40 #ifndef _MSC_VER
41 #include <tr1/unordered_set>
42 #else
43 #include <unordered_set>
44 #endif
45 
46 #include <boost/interprocess/shared_memory_object.hpp>
47 #include <boost/interprocess/mapped_region.hpp>
48 namespace bi = boost::interprocess;
49 
50 #include "liboamcpp.h"
51 #include "brmtypes.h"
52 #include "configcpp.h"
53 #include "rwlock.h"
54 #include "calpontsystemcatalog.h"
55 #include "mastersegmenttable.h"
56 #include "blocksize.h"
57 #include "dataconvert.h"
58 #include "oamcache.h"
59 #include "IDBDataFile.h"
60 #include "IDBPolicy.h"
61 #ifdef BRM_INFO
62 #include "tracer.h"
63 #include "configcpp.h"
64 #endif
65 
66 #define EXTENTMAP_DLLEXPORT
67 #include "extentmap.h"
68 #undef EXTENTMAP_DLLEXPORT
69 
70 #define EM_MAX_SEQNUM               2000000000
71 #define MAX_IO_RETRIES 10
72 #define EM_MAGIC_V1 0x76f78b1c
73 #define EM_MAGIC_V2 0x76f78b1d
74 #define EM_MAGIC_V3 0x76f78b1e
75 #define EM_MAGIC_V4 0x76f78b1f
76 
// Debug-build invariant check.  When the condition is false the failing
// file/line is reported on stderr and std::logic_error is thrown.  With NDEBUG
// defined the macro expands to nothing, so the condition is not evaluated.
//
// The debug expansion is wrapped in do { } while (0) so that "ASSERT(x);" is a
// single statement; a bare if-block would break (or silently rebind the else)
// when used as the body of an unbraced if/else.
#ifndef NDEBUG
#define ASSERT(x) \
	do { \
		if (!(x)) { \
			std::cerr << "assertion at file " << __FILE__ << " line " << __LINE__ << " failed" << std::endl; \
			throw std::logic_error("assertion failed"); \
		} \
	} while (0)
#else
#define ASSERT(x)
#endif
86 
87 using namespace std;
88 using namespace boost;
89 using namespace logging;
90 using namespace idbdatafile;
91 
92 namespace
93 {
94 unsigned ExtentSize = 0; // dmc-need to deprecate
95 unsigned ExtentRows              = 0;
96 unsigned filesPerColumnPartition = 0;
97 unsigned extentsPerSegmentFile   = 0;
98 
99 // Increment CP sequence (version) number, and wrap-around when applicable
incSeqNum(int32_t & seqNum)100 inline void incSeqNum(int32_t& seqNum)
101 {
102     seqNum++;
103 
104     if (seqNum > EM_MAX_SEQNUM)
105         seqNum = 0;
106 }
107 
108 }
109 
110 namespace BRM
111 {
112 
113 //------------------------------------------------------------------------------
114 // EMCasualPartition_struct methods
115 //------------------------------------------------------------------------------
116 
EMCasualPartition_struct()117 EMCasualPartition_struct::EMCasualPartition_struct()
118 {
119     lo_val = numeric_limits<int64_t>::min();
120     hi_val = numeric_limits<int64_t>::max();
121     sequenceNum = 0;
122     isValid = CP_INVALID;
123 }
124 
EMCasualPartition_struct(const int64_t lo,const int64_t hi,const int32_t seqNum)125 EMCasualPartition_struct::EMCasualPartition_struct(const int64_t lo, const int64_t hi, const int32_t seqNum)
126 {
127     lo_val = lo;
128     hi_val = hi;
129     sequenceNum = seqNum;
130     isValid = CP_INVALID;
131 }
132 
// Copy construction: takes lo_val, hi_val, sequenceNum and isValid from em.
EMCasualPartition_struct::EMCasualPartition_struct(const EMCasualPartition_struct& em)
{
    // The assignment operator copies exactly those four members.
    *this = em;
}
140 
// Member-wise assignment of the casual-partitioning state; returns *this.
EMCasualPartition_struct& EMCasualPartition_struct::operator= (const EMCasualPartition_struct& em)
{
    if (this != &em)
    {
        sequenceNum = em.sequenceNum;
        isValid     = em.isValid;
        hi_val      = em.hi_val;
        lo_val      = em.lo_val;
    }

    return *this;
}
149 
150 //------------------------------------------------------------------------------
151 // Version 4 EmEntry methods
152 //------------------------------------------------------------------------------
153 
EMEntry()154 EMEntry::EMEntry()
155 {
156     fileID = 0;
157     blockOffset = 0;
158     HWM = 0;
159     partitionNum = 0;
160     segmentNum   = 0;
161     dbRoot       = 0;
162     colWid       = 0;
163     status		= 0;
164 }
165 
// Copy construction: range.start/range.size, fileID, blockOffset, HWM,
// partition, partitionNum, segmentNum, dbRoot, colWid and status come from e.
EMEntry::EMEntry(const EMEntry& e)
{
    // The assignment operator copies exactly that set of members.
    *this = e;
}
180 
// Member-wise assignment of an extent map entry; returns *this.
EMEntry& EMEntry::operator= (const EMEntry& e)
{
    // LBID range covered by the extent
    range.start  = e.range.start;
    range.size   = e.range.size;

    // location of the extent
    fileID       = e.fileID;
    dbRoot       = e.dbRoot;
    partitionNum = e.partitionNum;
    segmentNum   = e.segmentNum;
    blockOffset  = e.blockOffset;

    // remaining per-extent state
    HWM          = e.HWM;
    colWid       = e.colWid;
    status       = e.status;
    partition    = e.partition;

    return *this;
}
196 
operator <(const EMEntry & e) const197 bool EMEntry::operator< (const EMEntry& e) const
198 {
199     if (range.start < e.range.start)
200         return true;
201 
202     return false;
203 }
204 
205 /*static*/
206 boost::mutex ExtentMapImpl::fInstanceMutex;
207 boost::mutex ExtentMap::mutex;
208 
209 /*static*/
210 ExtentMapImpl* ExtentMapImpl::fInstance = 0;
211 
212 /*static*/
makeExtentMapImpl(unsigned key,off_t size,bool readOnly)213 ExtentMapImpl* ExtentMapImpl::makeExtentMapImpl(unsigned key, off_t size, bool readOnly)
214 {
215     boost::mutex::scoped_lock lk(fInstanceMutex);
216 
217     if (fInstance)
218     {
219         if (key != fInstance->fExtMap.key())
220         {
221             BRMShmImpl newShm(key, 0);
222             fInstance->swapout(newShm);
223         }
224 
225         ASSERT(key == fInstance->fExtMap.key());
226         return fInstance;
227     }
228 
229     fInstance = new ExtentMapImpl(key, size, readOnly);
230 
231     return fInstance;
232 }
233 
ExtentMapImpl(unsigned key,off_t size,bool readOnly)234 ExtentMapImpl::ExtentMapImpl(unsigned key, off_t size, bool readOnly) :
235     fExtMap(key, size, readOnly)
236 {
237 }
238 
239 /*static*/
240 boost::mutex FreeListImpl::fInstanceMutex;
241 
242 /*static*/
243 FreeListImpl* FreeListImpl::fInstance = 0;
244 
245 /*static*/
makeFreeListImpl(unsigned key,off_t size,bool readOnly)246 FreeListImpl* FreeListImpl::makeFreeListImpl(unsigned key, off_t size, bool readOnly)
247 {
248     boost::mutex::scoped_lock lk(fInstanceMutex);
249 
250     if (fInstance)
251     {
252         if (key != fInstance->fFreeList.key())
253         {
254             BRMShmImpl newShm(key, 0);
255             fInstance->swapout(newShm);
256         }
257 
258         ASSERT(key == fInstance->fFreeList.key());
259         return fInstance;
260     }
261 
262     fInstance = new FreeListImpl(key, size, readOnly);
263 
264     return fInstance;
265 }
266 
FreeListImpl(unsigned key,off_t size,bool readOnly)267 FreeListImpl::FreeListImpl(unsigned key, off_t size, bool readOnly) :
268     fFreeList(key, size, readOnly)
269 {
270 }
271 
ExtentMap()272 ExtentMap::ExtentMap()
273 {
274     fExtentMap = NULL;
275     fFreeList = NULL;
276     fCurrentEMShmkey = -1;
277     fCurrentFLShmkey = -1;
278     fEMShminfo = NULL;
279     fFLShminfo = NULL;
280     r_only = false;
281     flLocked = false;
282     emLocked = false;
283     fPExtMapImpl = 0;
284     fPFreeListImpl = 0;
285 
286 #ifdef BRM_INFO
287     fDebug = ("Y" == config::Config::makeConfig()->getConfig("DBRM", "Debug"));
288 #endif
289 }
290 
~ExtentMap()291 ExtentMap::~ExtentMap()
292 {
293     PmDbRootMap_t::iterator iter = fPmDbRootMap.begin();
294     PmDbRootMap_t::iterator end = fPmDbRootMap.end();
295 
296     while (iter != end)
297     {
298         delete iter->second;
299         iter->second = 0;
300         ++iter;
301     }
302 
303     fPmDbRootMap.clear();
304 }
305 
// Casual Partitioning support
307 //
308 
309 /**
310 * @brief mark the max/min values of an extent as invalid
311 *
312 * mark the extent containing the lbid as invalid and
313 * increment the sequenceNum value. If the lbid is found
314 * in the extent map a 0 is returned otherwise a 1.
315 *
316 **/
317 
_markInvalid(const LBID_t lbid,const execplan::CalpontSystemCatalog::ColDataType colDataType)318 int ExtentMap::_markInvalid(const LBID_t lbid, const execplan::CalpontSystemCatalog::ColDataType colDataType)
319 {
320     int entries;
321     int i;
322     LBID_t lastBlock;
323 
324     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
325 
326     for (i = 0; i < entries; i++)
327     {
328         lastBlock = fExtentMap[i].range.start +
329                     (static_cast<LBID_t>(fExtentMap[i].range.size) * 1024) - 1;
330 
331         if (fExtentMap[i].range.size != 0)
332         {
333             if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock)
334             {
335                 makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
336                 fExtentMap[i].partition.cprange.isValid = CP_UPDATING;
337 
338                 if (isUnsigned(colDataType))
339                 {
340                     fExtentMap[i].partition.cprange.lo_val = numeric_limits<uint64_t>::max();
341                     fExtentMap[i].partition.cprange.hi_val = 0;
342                 }
343                 else
344                 {
345                     fExtentMap[i].partition.cprange.lo_val = numeric_limits<int64_t>::max();
346                     fExtentMap[i].partition.cprange.hi_val = numeric_limits<int64_t>::min();
347                 }
348 
349                 incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
350 #ifdef BRM_DEBUG
351                 ostringstream os;
352                 os << "ExtentMap::_markInvalid(): casual partitioning update: firstLBID=" <<
353                    fExtentMap[i].range.start << " lastLBID=" << fExtentMap[i].range.start +
354                    fExtentMap[i].range.size * 1024 - 1 << " OID=" << fExtentMap[i].fileID <<
355                    " min=" << fExtentMap[i].partition.cprange.lo_val <<
356                    " max=" << fExtentMap[i].partition.cprange.hi_val <<
357                    "seq=" << fExtentMap[i].partition.cprange.sequenceNum;
358                 log(os.str(), logging::LOG_TYPE_DEBUG);
359 #endif
360                 return 0;
361             }
362         }
363     }
364 
365     throw logic_error("ExtentMap::markInvalid(): lbid isn't allocated");
366 }
367 
markInvalid(const LBID_t lbid,const execplan::CalpontSystemCatalog::ColDataType colDataType)368 int ExtentMap::markInvalid(const LBID_t lbid,
369                            const execplan::CalpontSystemCatalog::ColDataType colDataType)
370 {
371 #ifdef BRM_DEBUG
372 
373     if (lbid < 0)
374         throw invalid_argument("ExtentMap::markInvalid(): lbid must be >= 0");
375 
376 #endif
377 #ifdef BRM_INFO
378 
379     if (fDebug)
380     {
381         TRACER_WRITELATER("_markInvalid");
382         TRACER_ADDINPUT(lbid);
383         TRACER_WRITE;
384     }
385 
386 #endif
387 
388 #ifdef BRM_DEBUG
389     ostringstream os;
390     os << "ExtentMap::markInvalid(" << lbid << "," << colDataType << ")";
391     log(os.str(), logging::LOG_TYPE_DEBUG);
392 #endif
393 
394     grabEMEntryTable(WRITE);
395     return _markInvalid(lbid, colDataType);
396 }
397 
398 /**
399 * @brief calls markInvalid(LBID_t lbid) for each extent containing any lbid in vector<LBID_t>& lbids
400 *
401 **/
402 
markInvalid(const vector<LBID_t> & lbids,const vector<execplan::CalpontSystemCatalog::ColDataType> & colDataTypes)403 int ExtentMap::markInvalid(const vector<LBID_t>& lbids,
404                            const vector<execplan::CalpontSystemCatalog::ColDataType>& colDataTypes)
405 {
406     uint32_t i, size = lbids.size();
407 
408 #ifdef BRM_DEBUG
409 
410     for (i = 0; i < size; ++i)
411         if (lbids[i] < 0)
412             throw invalid_argument("ExtentMap::markInvalid(vector): all lbids must be >= 0");
413 
414 #endif
415 #ifdef BRM_INFO
416 
417     if (fDebug)
418     {
419         TRACER_WRITELATER("_markInvalid");
420         TRACER_ADDINPUT(size);
421         TRACER_WRITE;
422     }
423 
424 #endif
425 
426     grabEMEntryTable(WRITE);
427 
428     // XXXPAT: what's the proper return code when one and only one fails?
429     for (i = 0; i < size; ++i)
430     {
431 #ifdef BRM_DEBUG
432         ostringstream os;
433         os << "ExtentMap::markInvalid() lbids[" << i << "]=" << lbids[i] <<
434            " colDataTypes[" << i << "]=" << colDataTypes[i];
435         log(os.str(), logging::LOG_TYPE_DEBUG);
436 #endif
437 
438         try
439         {
440             _markInvalid(lbids[i], colDataTypes[i]);
441         }
442         catch (std::exception& e)
443         {
444             cerr << "ExtentMap::markInvalid(vector): warning!  lbid " << lbids[i] <<
445                  " caused " << e.what() << endl;
446         }
447     }
448 
449     return 0;
450 }
451 
452 /**
453 * @brief set the max/min values for the extent if the seqNum matches the extents sequenceNum
454 *
455 * reset the lbid's hi_val to max and lo_val to min
456 * the seqNum matches the ExtentMap.sequenceNum. Then increments
457 * the current sequenceNum value by 1. If the sequenceNum does not
458 * match the seqNum value do not update the lbid's max/min values
459 * or increment the sequenceNum value and return a -1.
460 
461 **/
462 
setMaxMin(const LBID_t lbid,const int64_t max,const int64_t min,const int32_t seqNum,bool firstNode)463 int ExtentMap::setMaxMin(const LBID_t lbid,
464                          const int64_t max,
465                          const int64_t min,
466                          const int32_t seqNum,
467                          bool firstNode)
468 {
469 #ifdef BRM_INFO
470 
471     if (fDebug)
472     {
473         TRACER_WRITELATER("updateMaxMin");
474         TRACER_ADDINPUT(lbid);
475         TRACER_ADDINPUT(max);
476         TRACER_ADDINPUT(min);
477         TRACER_ADDINPUT(seqNum);
478         TRACER_WRITE;
479     }
480 
481 #endif
482     int entries;
483     int i;
484     LBID_t lastBlock;
485     int32_t curSequence;
486 
487 #ifdef BRM_DEBUG
488 
489     if (lbid < 0)
490         throw invalid_argument("ExtentMap::setMaxMin(): lbid must be >= 0");
491 
492 #endif
493 
494     grabEMEntryTable(WRITE);
495     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
496 
497     for (i = 0; i < entries; i++)
498     {
499         if (fExtentMap[i].range.size != 0)
500         {
501             lastBlock = fExtentMap[i].range.start +
502                         (static_cast<LBID_t>(fExtentMap[i].range.size) * 1024) - 1;
503             curSequence = fExtentMap[i].partition.cprange.sequenceNum;
504 
505             if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock)
506             {
507 #ifdef BRM_DEBUG
508 
509                 if (firstNode)
510                 {
511                     ostringstream os;
512                     os << "ExtentMap::setMaxMin(): casual partitioning update: firstLBID=" <<
513                        fExtentMap[i].range.start << " lastLBID=" << fExtentMap[i].range.start +
514                        fExtentMap[i].range.size * 1024 - 1 << " OID=" << fExtentMap[i].fileID <<
515                        " min=" << min << " max=" << max << "seq=" << seqNum;
516                     log(os.str(), logging::LOG_TYPE_DEBUG);
517                 }
518 
519 #endif
520 
521                 if (curSequence == seqNum)
522                 {
523                     makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
524                     fExtentMap[i].partition.cprange.hi_val = max;
525                     fExtentMap[i].partition.cprange.lo_val = min;
526                     fExtentMap[i].partition.cprange.isValid = CP_VALID;
527                     incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
528                     return 0;
529                 }
530                 //special val to indicate a reset--used by editem -c.
531                 //Also used by COMMIT and ROLLBACK to invalidate CP.
532                 else if (seqNum == -1)
533                 {
534                     makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
535                     // We set hi_val and lo_val to correct values for signed or unsigned
536                     // during the markinvalid step, which sets the invalid variable to CP_UPDATING.
537                     // During this step (seqNum == -1), the min and max passed in are not reliable
538                     // and should not be used.
539                     fExtentMap[i].partition.cprange.isValid = CP_INVALID;
540                     incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
541                     return 0;
542                 }
543                 else
544                 {
545                     return 0;
546                 }
547             }
548         }
549     }
550 
551     if (emLocked)
552         releaseEMEntryTable(WRITE);
553 
554     throw logic_error("ExtentMap::setMaxMin(): lbid isn't allocated");
555 // 	return -1;
556 }
557 
558 // @bug 1970.  Added updateExtentsMaxMin function.
559 // @note - The key passed in the map must the the first LBID in the extent.
setExtentsMaxMin(const CPMaxMinMap_t & cpMap,bool firstNode,bool useLock)560 void ExtentMap::setExtentsMaxMin(const CPMaxMinMap_t& cpMap, bool firstNode, bool useLock)
561 {
562     CPMaxMinMap_t::const_iterator it;
563 
564 #ifdef BRM_DEBUG
565     log("ExtentMap::setExtentsMaxMin()", logging::LOG_TYPE_DEBUG);
566 
567     for (it = cpMap.begin(); it != cpMap.end(); ++it)
568     {
569         ostringstream os;
570         os << "FirstLBID=" << it->first <<
571            " min=" << it->second.min <<
572            " max=" << it->second.max <<
573            " seq=" << it->second.seqNum;
574         log(os.str(), logging::LOG_TYPE_DEBUG);
575     }
576 
577 #endif
578 
579 
580 #ifdef BRM_INFO
581 
582     if (fDebug)
583     {
584         TRACER_WRITELATER("setExtentsMaxMin");
585 
586         for (it = cpMap.begin(); it != cpMap.end(); ++it)
587         {
588             TRACER_ADDINPUT((*it).first);
589             TRACER_ADDINPUT((*it).second.max);
590             TRACER_ADDINPUT((*it).second.min);
591             TRACER_ADDINPUT((*it).second.seqNum);
592             TRACER_WRITE;
593         }
594     }
595 
596 #endif
597     int entries;
598     int i;
599     int32_t curSequence;
600     const int32_t extentsToUpdate = cpMap.size();
601     int32_t extentsUpdated = 0;
602 
603 #ifdef BRM_DEBUG
604 
605     if (extentsToUpdate <= 0)
606         throw invalid_argument("ExtentMap::setExtentsMaxMin(): cpMap must be populated");
607 
608 #endif
609 
610     if (useLock)
611         grabEMEntryTable(WRITE);
612 
613     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
614 
615     for (i = 0; i < entries; i++)
616     {
617         if (fExtentMap[i].range.size != 0)
618         {
619             it = cpMap.find(fExtentMap[i].range.start);
620 
621             if (it != cpMap.end())
622             {
623                 curSequence = fExtentMap[i].partition.cprange.sequenceNum;
624 
625                 if (curSequence == it->second.seqNum &&
626                         fExtentMap[i].partition.cprange.isValid == CP_INVALID)
627                 {
628                     makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
629                     fExtentMap[i].partition.cprange.hi_val = it->second.max;
630                     fExtentMap[i].partition.cprange.lo_val = it->second.min;
631                     fExtentMap[i].partition.cprange.isValid = CP_VALID;
632                     incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
633                     extentsUpdated++;
634 #ifdef BRM_DEBUG
635 
636                     if (firstNode)
637                     {
638                         ostringstream os;
639                         os << "ExtentMap::setExtentsMaxMin(): casual partitioning update: firstLBID=" <<
640                            fExtentMap[i].range.start << " lastLBID=" << fExtentMap[i].range.start +
641                            fExtentMap[i].range.size * 1024 - 1 << " OID=" << fExtentMap[i].fileID <<
642                            " min=" << it->second.min << " max=" <<
643                            it->second.max << " seq=" <<
644                            it->second.seqNum;
645                         log(os.str(), logging::LOG_TYPE_DEBUG);
646                     }
647 
648 #endif
649                 }
650                 //special val to indicate a reset -- ignore the min/max
651                 else if (it->second.seqNum == -1)
652                 {
653                     makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
654                     // We set hi_val and lo_val to correct values for signed or unsigned
655                     // during the markinvalid step, which sets the invalid variable to CP_UPDATING.
656                     // During this step (seqNum == -1), the min and max passed in are not reliable
657                     // and should not be used.
658                     fExtentMap[i].partition.cprange.isValid = CP_INVALID;
659                     incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
660                     extentsUpdated++;
661                 }
662                 //special val to indicate a reset -- assign the min/max
663                 else if (it->second.seqNum == -2)
664                 {
665                     makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
666                     fExtentMap[i].partition.cprange.hi_val = it->second.max;
667                     fExtentMap[i].partition.cprange.lo_val = it->second.min;
668                     fExtentMap[i].partition.cprange.isValid = CP_INVALID;
669                     incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
670                     extentsUpdated++;
671                 }
672                 // else sequence has changed since start of the query.  Don't update the EM entry.
673                 else
674                 {
675                     extentsUpdated++;
676                 }
677 
678                 if (extentsUpdated == extentsToUpdate)
679                 {
680                     return;
681                 }
682             }
683         }
684     }
685 
686     throw logic_error("ExtentMap::setExtentsMaxMin(): lbid isn't allocated");
687 }
688 
689 //------------------------------------------------------------------------------
690 // @bug 1970.  Added mergeExtentsMaxMin to merge CP info for list of extents.
// @note - The key passed in the map must be the starting LBID in the extent.
692 // Used by cpimport to update extentmap casual partition min/max.
693 // NULL or empty values should not be passed in as min/max values.
694 // seqNum in the input struct is not currently used.
695 //
696 // Note that DML calls markInvalid() to flag an extent as CP_UPDATING and incre-
697 // ments the sequence number prior to any change, and then marks the extent as
698 // CP_INVALID at transaction's end.
699 // Since cpimport locks the entire table prior to making any changes, it is
700 // assumed that the state of an extent will not be changed (by anyone else)
701 // during an import; so cpimport does not employ the intermediate CP_UPDATING
702 // state that DML uses.  cpimport just waits till the end of the job and incre-
703 // ments the sequence number and changes the state to CP_INVALID at that time.
704 // We may want/need to reconsider this at some point.
705 //------------------------------------------------------------------------------
mergeExtentsMaxMin(CPMaxMinMergeMap_t & cpMap,bool useLock)706 void ExtentMap::mergeExtentsMaxMin(CPMaxMinMergeMap_t& cpMap, bool useLock)
707 {
708     CPMaxMinMergeMap_t::const_iterator it;
709 
710 #ifdef BRM_DEBUG
711     log("ExtentMap::mergeExtentsMaxMin()", logging::LOG_TYPE_DEBUG);
712 
713     for (it = cpMap.begin(); it != cpMap.end(); ++it)
714     {
715         ostringstream os;
716         os << "FirstLBID=" << it->first <<
717            " min=" << it->second.min <<
718            " max=" << it->second.max <<
719            " seq=" << it->second.seqNum <<
720            " typ: " << (*it).second.type <<
721            " new: " << (*it).second.newExtent;
722         log(os.str(), logging::LOG_TYPE_DEBUG);
723     }
724 
725 #endif
726 
727 #ifdef BRM_INFO
728 
729     if (fDebug)
730     {
731         TRACER_WRITENOW("mergeExtentsMaxMin");
732         unsigned int count = 1;
733 
734         for (it = cpMap.begin(); it != cpMap.end(); ++it)
735         {
736             ostringstream oss;
737             oss << "  "   << count                <<
738                 ". LBID: " << (*it).first          <<
739                 "; max: " << (*it).second.max     <<
740                 "; min: " << (*it).second.min     <<
741                 "; seq: " << (*it).second.seqNum  <<
742                 "; typ: " << (*it).second.type    <<
743                 "; new: " << (*it).second.newExtent;
744             TRACER_WRITEDIRECT(oss.str());
745             count++;
746         }
747     }
748 
749 #endif
750 
751     const int32_t extentsToMerge = cpMap.size();
752     int32_t extentsMerged = 0;
753 
754 #ifdef BRM_DEBUG
755 
756     if (extentsToMerge <= 0)
757         throw invalid_argument("ExtentMap::mergeExtentsMaxMin(): "
758                                "cpMap must be populated");
759 
760 #endif
761 
762     if (useLock)
763         grabEMEntryTable(WRITE);
764 
765     int entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
766 
767     for (int i = 0; i < entries; i++)  			// loop through all extents
768     {
769         if (fExtentMap[i].range.size != 0)  	// find eligible extents
770         {
771             it = cpMap.find(fExtentMap[i].range.start);
772 
773             if (it != cpMap.end())
774             {
775 #ifdef BRM_DEBUG
776                 ostringstream os;
777                 os << "ExtentMap::mergeExtentsMaxMin(): casual partitioning update: firstLBID=" <<
778                    fExtentMap[i].range.start << " lastLBID=" << fExtentMap[i].range.start +
779                    fExtentMap[i].range.size * 1024 - 1 << " OID=" << fExtentMap[i].fileID <<
780                    " hi_val=" << fExtentMap[i].partition.cprange.hi_val <<
781                    " lo_val=" << fExtentMap[i].partition.cprange.lo_val <<
782                    " min=" << it->second.min << " max=" << it->second.max <<
783                    " seq=" << it->second.seqNum;
784                 log(os.str(), logging::LOG_TYPE_DEBUG);
785 #endif
786 
787                 switch (fExtentMap[i].partition.cprange.isValid)
788                 {
789                     // Merge input min/max with current min/max
790                     case CP_VALID:
791                     {
792                         if (!isValidCPRange( it->second.max,
793                                              it->second.min,
794                                              it->second.type ))
795                         {
796                             break;
797                         }
798 
799                         makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
800 
801                         // We check the validity of the current min/max,
802                         // because isValid could be CP_VALID for an extent
803                         // having all NULL values, in which case the current
804                         // min/max needs to be set instead of merged.
805 
806                         if (isValidCPRange(
807                                     fExtentMap[i].partition.cprange.hi_val,
808                                     fExtentMap[i].partition.cprange.lo_val,
809                                     it->second.type))
810                         {
811                             // Swap byte order to do binary string comparison
812                             if (isCharType(it->second.type))
813                             {
814                                 int64_t newMinVal =
815                                     static_cast<int64_t>( uint64ToStr(
816                                                               static_cast<uint64_t>(it->second.min)));
817                                 int64_t newMaxVal =
818                                     static_cast<int64_t>( uint64ToStr(
819                                                               static_cast<uint64_t>(it->second.max)));
820                                 int64_t oldMinVal =
821                                     static_cast<int64_t>( uint64ToStr(
822                                                               static_cast<uint64_t>(
823                                                                   fExtentMap[i].partition.cprange.lo_val)) );
824                                 int64_t oldMaxVal =
825                                     static_cast<int64_t>( uint64ToStr(
826                                                               static_cast<uint64_t>(
827                                                                   fExtentMap[i].partition.cprange.hi_val)) );
828 
829                                 if (newMinVal < oldMinVal)
830                                     fExtentMap[i].partition.cprange.lo_val =
831                                         it->second.min;
832 
833                                 if (newMaxVal > oldMaxVal)
834                                     fExtentMap[i].partition.cprange.hi_val =
835                                         it->second.max;
836                             }
837                             else if (isUnsigned(it->second.type))
838                             {
839                                 if (static_cast<uint64_t>(it->second.min) <
840                                         static_cast<uint64_t>(fExtentMap[i].partition.cprange.lo_val))
841                                 {
842                                     fExtentMap[i].partition.cprange.lo_val =
843                                         it->second.min;
844                                 }
845 
846                                 if (static_cast<uint64_t>(it->second.max) >
847                                         static_cast<uint64_t>(fExtentMap[i].partition.cprange.hi_val))
848                                 {
849                                     fExtentMap[i].partition.cprange.hi_val =
850                                         it->second.max;
851                                 }
852                             }
853                             else
854                             {
855                                 if (it->second.min <
856                                         fExtentMap[i].partition.cprange.lo_val)
857                                     fExtentMap[i].partition.cprange.lo_val =
858                                         it->second.min;
859 
860                                 if (it->second.max >
861                                         fExtentMap[i].partition.cprange.hi_val)
862                                     fExtentMap[i].partition.cprange.hi_val =
863                                         it->second.max;
864                             }
865                         }
866                         else
867                         {
868                             fExtentMap[i].partition.cprange.lo_val =
869                                 it->second.min;
870                             fExtentMap[i].partition.cprange.hi_val =
871                                 it->second.max;
872                         }
873 
874                         incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
875 
876                         break;
877                     }
878 
879                     // DML is updating; just increment seqnum.
880                     // This case is here for completeness.  Table lock should
881                     // prevent this state from occurring (see notes at top of
882                     // this function)
883                     case CP_UPDATING:
884                     {
885                         makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
886                         incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
887 
888                         break;
889                     }
890 
891                     // Reset min/max to new min/max only "if" we can treat this
892                     // as a new extent, else leave the extent marked as INVALID
893                     case CP_INVALID:
894                     default:
895                     {
896                         makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry));
897 
898                         if (it->second.newExtent)
899                         {
900                             if (isValidCPRange( it->second.max,
901                                                 it->second.min,
902                                                 it->second.type ))
903                             {
904                                 fExtentMap[i].partition.cprange.lo_val =
905                                     it->second.min;
906                                 fExtentMap[i].partition.cprange.hi_val =
907                                     it->second.max;
908                             }
909 
910                             // Even if invalid range; we set state to CP_VALID,
911                             // because the extent is valid, it is just empty.
912                             fExtentMap[i].partition.cprange.isValid = CP_VALID;
913                         }
914 
915                         incSeqNum(fExtentMap[i].partition.cprange.sequenceNum);
916                         break;
917                     }
918                 }	// switch on isValid state
919 
920                 extentsMerged++;
921 
922                 if (extentsMerged == extentsToMerge)
923                 {
924                     return; // Leave when all extents in map are matched
925                 }
926 
927                 // Deleting objects from map, may speed up successive searches
928                 cpMap.erase( it );
929 
930             }	// found a matching extent in the Map
931         }	// extent map range size != 0
932     }	// end of loop through extent map
933 
934     throw logic_error("ExtentMap::mergeExtentsMaxMin(): lbid not found");
935 }
936 
937 //------------------------------------------------------------------------------
938 // Use this function to see if the range is a valid min/max range or not.
939 // Range is considered invalid if min or max, are NULL (min()), or EMPTY
940 // (min()+1). For unsigned types NULL is max() and EMPTY is max()-1.
941 //------------------------------------------------------------------------------
isValidCPRange(int64_t max,int64_t min,execplan::CalpontSystemCatalog::ColDataType type) const942 bool ExtentMap::isValidCPRange(int64_t max, int64_t min, execplan::CalpontSystemCatalog::ColDataType type) const
943 {
944     if (isUnsigned(type))
945     {
946         if ( (static_cast<uint64_t>(min) >= (numeric_limits<uint64_t>::max() - 1)) ||
947                 (static_cast<uint64_t>(max) >= (numeric_limits<uint64_t>::max() - 1)) )
948         {
949             return false;
950         }
951     }
952     else
953     {
954         if ( (min <= (numeric_limits<int64_t>::min() + 1)) ||
955                 (max <= (numeric_limits<int64_t>::min() + 1)) )
956         {
957             return false;
958         }
959     }
960 
961     return true;
962 }
963 
964 /**
965 * @brief retrieve the hi_val and lo_val or sequenceNum of the extent containing the LBID lbid.
966 *
967 * For the extent containing the LBID lbid, return the max/min values if the extent range values
968 * are valid and a -1 in the seqNum parameter. If the range values are flaged as invalid
969 * return the sequenceNum of the extent and the max/min values as -1.
970 **/
971 
getMaxMin(const LBID_t lbid,int64_t & max,int64_t & min,int32_t & seqNum)972 int ExtentMap::getMaxMin(const LBID_t lbid,
973                          int64_t& max,
974                          int64_t& min,
975                          int32_t& seqNum)
976 {
977 #ifdef BRM_INFO
978 
979     if (fDebug)
980     {
981         TRACER_WRITELATER("getMaxMin");
982         TRACER_ADDINPUT(lbid);
983         TRACER_ADDOUTPUT(max);
984         TRACER_ADDOUTPUT(min);
985         TRACER_ADDOUTPUT(seqNum);
986         TRACER_WRITE;
987     }
988 
989 #endif
990     max = numeric_limits<uint64_t>::max();
991     min = 0;
992     seqNum *= (-1);
993     int entries;
994     int i;
995     LBID_t lastBlock;
996     int isValid = CP_INVALID;
997 
998 #ifdef BRM_DEBUG
999 
1000     if (lbid < 0)
1001         throw invalid_argument("ExtentMap::getMaxMin(): lbid must be >= 0");
1002 
1003 #endif
1004 
1005     grabEMEntryTable(READ);
1006     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
1007 
1008     for (i = 0; i < entries; i++)
1009     {
1010         if (fExtentMap[i].range.size != 0)
1011         {
1012             lastBlock = fExtentMap[i].range.start +
1013                         (static_cast<LBID_t>(fExtentMap[i].range.size) * 1024) - 1;
1014 
1015             if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock)
1016             {
1017                 max = fExtentMap[i].partition.cprange.hi_val;
1018                 min = fExtentMap[i].partition.cprange.lo_val;
1019                 seqNum = fExtentMap[i].partition.cprange.sequenceNum;
1020                 isValid = fExtentMap[i].partition.cprange.isValid;
1021                 releaseEMEntryTable(READ);
1022                 return isValid;
1023             }
1024         }
1025     }
1026 
1027     releaseEMEntryTable(READ);
1028     throw logic_error("ExtentMap::getMaxMin(): that lbid isn't allocated");
1029 //   	return -1;
1030 }
1031 
1032 /* Removes a range from the freelist.  Used by load() */
reserveLBIDRange(LBID_t start,uint8_t size)1033 void ExtentMap::reserveLBIDRange(LBID_t start, uint8_t size)
1034 {
1035     int i;
1036     int flEntries = fFLShminfo->allocdSize / sizeof(InlineLBIDRange);
1037     LBID_t lastLBID = start + (size * 1024) - 1;
1038     int32_t freeIndex = -1;
1039 
1040     /* Find a range the request intersects.  There should be one and only one. */
1041     for (i = 0; i < flEntries; i++)
1042     {
1043         LBID_t eLastLBID;
1044 
1045         // while scanning, grab the first free slot
1046         if (fFreeList[i].size == 0)
1047         {
1048             if (freeIndex == -1)
1049                 freeIndex = i;
1050 
1051             continue;
1052         }
1053 
1054         eLastLBID = fFreeList[i].start + (((int64_t) fFreeList[i].size) * 1024) - 1;
1055 
1056         /* if it's at the front... */
1057         if (start == fFreeList[i].start)
1058         {
1059             /* if the request is larger than the freelist entry -> implies an extent
1060              * overlap.  This is debugging code. */
1061             //idbassert(size > fFreeList[i].size);
1062             makeUndoRecord(&fFreeList[i], sizeof(InlineLBIDRange));
1063             fFreeList[i].start += size * 1024;
1064             fFreeList[i].size -= size;
1065 
1066             if (fFreeList[i].size == 0)
1067             {
1068                 makeUndoRecord(fFLShminfo, sizeof(MSTEntry));
1069                 fFLShminfo->currentSize -= sizeof(InlineLBIDRange);
1070             }
1071 
1072             break;
1073         }
1074         /* if it's at the back... */
1075         else if (eLastLBID == lastLBID)
1076         {
1077             makeUndoRecord(&fFreeList[i], sizeof(InlineLBIDRange));
1078             fFreeList[i].size -= size;
1079 
1080             if (fFreeList[i].size == 0)
1081             {
1082                 makeUndoRecord(fFLShminfo, sizeof(MSTEntry));
1083                 fFLShminfo->currentSize -= sizeof(InlineLBIDRange);
1084             }
1085 
1086             break;
1087             /* This entry won't be the same size as the request or the first
1088              * clause would have run instead.
1089              */
1090         }
1091         /* if it's in the middle... */
1092         /* break it into two elements */
1093         else if (fFreeList[i].start < start && eLastLBID > lastLBID)
1094         {
1095             if (freeIndex == -1)
1096             {
1097                 if (fFLShminfo->currentSize == fFLShminfo->allocdSize)
1098                 {
1099                     growFLShmseg();
1100                     freeIndex = flEntries;
1101                 }
1102                 else
1103                     for (freeIndex = i + 1; freeIndex < flEntries; freeIndex++)
1104                         if (fFreeList[freeIndex].size == 0)
1105                             break;
1106 
1107 #ifdef BRM_DEBUG
1108                 idbassert(nextIndex < flEntries);
1109 #endif
1110             }
1111 
1112             makeUndoRecord(&fFreeList[i], sizeof(InlineLBIDRange));
1113             makeUndoRecord(&fFreeList[freeIndex], sizeof(InlineLBIDRange));
1114             makeUndoRecord(fFLShminfo, sizeof(MSTEntry));
1115             fFreeList[i].size = (start - fFreeList[i].start) / 1024;
1116             fFreeList[freeIndex].start = start + (size * 1024);
1117             fFreeList[freeIndex].size = (eLastLBID - lastLBID) / 1024;
1118             fFLShminfo->currentSize += sizeof(InlineLBIDRange);
1119             break;
1120         }
1121     }
1122 }
1123 
1124 /*
1125 	The file layout looks like this:
1126 
1127 	  EM Magic (32-bits)
1128 	  number of EM entries  (32-bits)
1129 	  number of FL entries  (32-bits)
1130 	  EMEntry
1131 	    ...   (* numEM)
1132 	  struct InlineLBIDRange
1133 	    ...   (* numFL)
1134 */
1135 
1136 
loadVersion4(IDBDataFile * in)1137 void ExtentMap::loadVersion4(IDBDataFile* in)
1138 {
1139     int emNumElements = 0, flNumElements = 0;
1140 
1141     int nbytes = 0;
1142     nbytes += in->read((char*) &emNumElements, sizeof(int));
1143     nbytes += in->read((char*) &flNumElements, sizeof(int));
1144     idbassert(emNumElements > 0);
1145 
1146     if ((size_t) nbytes != sizeof(int) + sizeof(int))
1147     {
1148         log_errno("ExtentMap::loadVersion4(): read ");
1149         throw runtime_error("ExtentMap::loadVersion4(): read failed. Check the error log.");
1150     }
1151 
1152     void *fExtentMapPtr = static_cast<void*>(fExtentMap);
1153     memset(fExtentMapPtr, 0, fEMShminfo->allocdSize);
1154     fEMShminfo->currentSize = 0;
1155 
1156     // init the free list
1157     memset(fFreeList, 0, fFLShminfo->allocdSize);
1158     fFreeList[0].size = (1 << 26);   // 2^36 LBIDs
1159     fFLShminfo->currentSize = sizeof(InlineLBIDRange);
1160 
1161     // @Bug 3498
1162     // Calculate how big an extent map we're going to need and allocate it in one call
1163     if ((fEMShminfo->allocdSize / sizeof(EMEntry)) < (unsigned)emNumElements)
1164     {
1165         size_t nrows = emNumElements;
1166 
1167         //Round up to the nearest EM_INCREMENT_ROWS
1168         if ((nrows % EM_INCREMENT_ROWS) != 0)
1169         {
1170             nrows /= EM_INCREMENT_ROWS;
1171             nrows++;
1172             nrows *= EM_INCREMENT_ROWS;
1173         }
1174 
1175         growEMShmseg(nrows);
1176     }
1177 
1178     size_t progress = 0, writeSize = emNumElements * sizeof(EMEntry);
1179     int err;
1180     char *writePos = (char *) fExtentMap;
1181     while (progress < writeSize)
1182     {
1183         err = in->read(writePos + progress, writeSize - progress);
1184         if (err <= 0)
1185         {
1186             log_errno("ExtentMap::loadVersion4(): read ");
1187             throw runtime_error("ExtentMap::loadVersion4(): read failed. Check the error log.");
1188         }
1189         progress += (uint) err;
1190     }
1191 
1192     for (int i = 0; i < emNumElements; i++)
1193     {
1194         reserveLBIDRange(fExtentMap[i].range.start, fExtentMap[i].range.size);
1195 
1196         //@bug 1911 - verify status value is valid
1197         if (fExtentMap[i].status < EXTENTSTATUSMIN ||
1198                 fExtentMap[i].status > EXTENTSTATUSMAX)
1199             fExtentMap[i].status = EXTENTAVAILABLE;
1200     }
1201 
1202     fEMShminfo->currentSize = emNumElements * sizeof(EMEntry);
1203 
1204 #ifdef DUMP_EXTENT_MAP
1205     EMEntry* emSrc = fExtentMap;
1206     cout << "lbid\tsz\toid\tfbo\thwm\tpart#\tseg#\tDBRoot\twid\tst\thi\tlo\tsq\tv" << endl;
1207 
1208     for (int i = 0; i < emNumElements; i++)
1209     {
1210         cout <<
1211              emSrc[i].start
1212              << '\t' << emSrc[i].size
1213              << '\t' << emSrc[i].fileID
1214              << '\t' << emSrc[i].blockOffset
1215              << '\t' << emSrc[i].HWM
1216              << '\t' << emSrc[i].partitionNum
1217              << '\t' << emSrc[i].segmentNum
1218              << '\t' << emSrc[i].dbRoot
1219              << '\t' << emSrc[i].status
1220              << '\t' << emSrc[i].partition.cprange.hi_val
1221              << '\t' << emSrc[i].partition.cprange.lo_val
1222              << '\t' << emSrc[i].partition.cprange.sequenceNum
1223              << '\t' << (int)(emSrc[i].partition.cprange.isValid)
1224              << endl;
1225     }
1226 
1227     cout << "Free list entries:" << endl;
1228     cout << "start\tsize" << endl;
1229 
1230     for (int i = 0; i < flNumElements; i++)
1231         cout << fFreeList[i].start << '\t' << fFreeList[i].size << endl;
1232 
1233 #endif
1234 }
1235 
load(const string & filename,bool fixFL)1236 void ExtentMap::load(const string& filename, bool fixFL)
1237 {
1238 #ifdef BRM_INFO
1239 
1240     if (fDebug)
1241     {
1242         TRACER_WRITELATER("load");
1243         TRACER_ADDSTRINPUT(filename);
1244         TRACER_WRITE;
1245     }
1246 
1247 #endif
1248 
1249     grabEMEntryTable(WRITE);
1250 
1251     try
1252     {
1253         grabFreeList(WRITE);
1254     }
1255     catch (...)
1256     {
1257         releaseEMEntryTable(WRITE);
1258         throw;
1259     }
1260 
1261     const char* filename_p = filename.c_str();
1262     scoped_ptr<IDBDataFile>  in(IDBDataFile::open(
1263                                     IDBPolicy::getType(filename_p, IDBPolicy::WRITEENG),
1264                                     filename_p, "r", 0));
1265 
1266     if (!in)
1267     {
1268         log_errno("ExtentMap::load(): open");
1269         releaseFreeList(WRITE);
1270         releaseEMEntryTable(WRITE);
1271         throw ios_base::failure("ExtentMap::load(): open failed. Check the error log.");
1272     }
1273 
1274     try
1275     {
1276         int emVersion = 0;
1277         int bytes = in->read((char*) &emVersion, sizeof(int));
1278 
1279         if (bytes == (int) sizeof(int) && emVersion == EM_MAGIC_V4)
1280             loadVersion4(in.get());
1281         else
1282         {
1283             log("ExtentMap::load(): That file is not a valid ExtentMap image");
1284             throw runtime_error("ExtentMap::load(): That file is not a valid ExtentMap image");
1285         }
1286     }
1287     catch (...)
1288     {
1289         releaseFreeList(WRITE);
1290         releaseEMEntryTable(WRITE);
1291         throw;
1292     }
1293 
1294     releaseFreeList(WRITE);
1295     releaseEMEntryTable(WRITE);
1296 //	checkConsistency();
1297 }
1298 
save(const string & filename)1299 void ExtentMap::save(const string& filename)
1300 {
1301 #ifdef BRM_INFO
1302 
1303     if (fDebug)
1304     {
1305         TRACER_WRITELATER("save");
1306         TRACER_ADDSTRINPUT(filename);
1307         TRACER_WRITE;
1308     }
1309 
1310 #endif
1311 
1312     int allocdSize, loadSize[3], i;
1313 
1314     grabEMEntryTable(READ);
1315 
1316     try
1317     {
1318         grabFreeList(READ);
1319     }
1320     catch (...)
1321     {
1322         releaseEMEntryTable(READ);
1323         throw;
1324     }
1325 
1326     if (fEMShminfo->currentSize == 0)
1327     {
1328         log("ExtentMap::save(): got request to save an empty BRM");
1329         releaseFreeList(READ);
1330         releaseEMEntryTable(READ);
1331         throw runtime_error("ExtentMap::save(): got request to save an empty BRM");
1332     }
1333 
1334     const char* filename_p = filename.c_str();
1335     scoped_ptr<IDBDataFile> out(IDBDataFile::open(
1336                                     IDBPolicy::getType(filename_p, IDBPolicy::WRITEENG),
1337                                     filename_p, "wb", IDBDataFile::USE_VBUF));
1338 
1339     if (!out)
1340     {
1341         log_errno("ExtentMap::save(): open");
1342         releaseFreeList(READ);
1343         releaseEMEntryTable(READ);
1344         throw ios_base::failure("ExtentMap::save(): open failed. Check the error log.");
1345     }
1346 
1347     loadSize[0] = EM_MAGIC_V4;
1348     loadSize[1] = fEMShminfo->currentSize / sizeof(EMEntry);
1349     loadSize[2] = fFLShminfo->allocdSize / sizeof(InlineLBIDRange); // needs to send all entries
1350 
1351     int bytes = 0;
1352 
1353     try
1354     {
1355         const int wsize = 3 * sizeof(int);
1356         bytes = out->write((char*)loadSize, wsize);
1357 
1358         if (bytes != wsize)
1359             throw ios_base::failure("ExtentMap::save(): write failed. Check the error log.");
1360     }
1361     catch (...)
1362     {
1363         releaseFreeList(READ);
1364         releaseEMEntryTable(READ);
1365         throw;
1366     }
1367 
1368     allocdSize = fEMShminfo->allocdSize / sizeof(EMEntry);
1369     //const int emEntrySize = sizeof(EMEntry);
1370 
1371     int first = -1, last = -1, err;
1372     size_t progress, writeSize;
1373     for (i = 0; i < allocdSize; i++)
1374     {
1375         if (fExtentMap[i].range.size > 0 && first == -1)
1376             first = i;
1377         else if (fExtentMap[i].range.size <= 0 && first != -1)
1378         {
1379             last = i;
1380             writeSize = (last - first) * sizeof(EMEntry);
1381             progress = 0;
1382             char *writePos = (char *) &fExtentMap[first];
1383             while (progress < writeSize)
1384             {
1385                 err = out->write(writePos + progress, writeSize - progress);
1386                 if (err < 0)
1387                 {
1388                     releaseFreeList(READ);
1389                     releaseEMEntryTable(READ);
1390                     throw ios_base::failure("ExtentMap::save(): write failed. Check the error log.");
1391                 }
1392                 progress += err;
1393             }
1394             first = -1;
1395         }
1396 
1397     }
1398     if (first != -1)
1399     {
1400         writeSize = (allocdSize - first) * sizeof(EMEntry);
1401         progress = 0;
1402         char *writePos = (char *) &fExtentMap[first];
1403         while (progress < writeSize)
1404         {
1405             err = out->write(writePos + progress, writeSize - progress);
1406             if (err < 0)
1407             {
1408                 releaseFreeList(READ);
1409                 releaseEMEntryTable(READ);
1410                 throw ios_base::failure("ExtentMap::save(): write failed. Check the error log.");
1411             }
1412             progress += err;
1413         }
1414     }
1415 
1416     //allocdSize = fFLShminfo->allocdSize / sizeof(InlineLBIDRange);
1417     //const int inlineLbidRangeSize = sizeof(InlineLBIDRange);
1418 
1419     progress = 0;
1420     writeSize = fFLShminfo->allocdSize;
1421     char *writePos = (char *) fFreeList;
1422     while (progress < writeSize)
1423     {
1424         err = out->write(writePos + progress, writeSize - progress);
1425         if (err < 0)
1426         {
1427             releaseFreeList(READ);
1428             releaseEMEntryTable(READ);
1429             throw ios_base::failure("ExtentMap::save(): write failed. Check the error log.");
1430         }
1431 
1432         progress += err;
1433     }
1434 
1435     releaseFreeList(READ);
1436     releaseEMEntryTable(READ);
1437 }
1438 
1439 /* always returns holding the EM lock, and with the EM seg mapped */
grabEMEntryTable(OPS op)1440 void ExtentMap::grabEMEntryTable(OPS op)
1441 {
1442     boost::mutex::scoped_lock lk(mutex);
1443 
1444     if (op == READ)
1445         fEMShminfo = fMST.getTable_read(MasterSegmentTable::EMTable);
1446     else
1447     {
1448         fEMShminfo = fMST.getTable_write(MasterSegmentTable::EMTable);
1449         emLocked = true;
1450     }
1451 
1452     if (!fPExtMapImpl || fPExtMapImpl->key() != (unsigned)fEMShminfo->tableShmkey)
1453     {
1454         if (fExtentMap != NULL)
1455         {
1456             fExtentMap = NULL;
1457         }
1458 
1459         if (fEMShminfo->allocdSize == 0)
1460         {
1461             if (op == READ)
1462             {
1463                 fMST.getTable_upgrade(MasterSegmentTable::EMTable);
1464                 emLocked = true;
1465 
1466                 if (fEMShminfo->allocdSize == 0)
1467                     growEMShmseg();
1468 
1469                 emLocked = false;	// has to be done holding the write lock
1470                 fMST.getTable_downgrade(MasterSegmentTable::EMTable);
1471             }
1472             else
1473                 growEMShmseg();
1474         }
1475         else
1476         {
1477             fPExtMapImpl = ExtentMapImpl::makeExtentMapImpl(fEMShminfo->tableShmkey, 0);
1478             ASSERT(fPExtMapImpl);
1479 
1480             if (r_only)
1481                 fPExtMapImpl->makeReadOnly();
1482 
1483             fExtentMap = fPExtMapImpl->get();
1484 
1485             if (fExtentMap == NULL)
1486             {
1487                 log_errno("ExtentMap::grabEMEntryTable(): shmat");
1488                 throw runtime_error("ExtentMap::grabEMEntryTable(): shmat failed.  Check the error log.");
1489             }
1490         }
1491     }
1492     else
1493         fExtentMap = fPExtMapImpl->get();
1494 }
1495 
1496 /* always returns holding the FL lock */
grabFreeList(OPS op)1497 void ExtentMap::grabFreeList(OPS op)
1498 {
1499     boost::mutex::scoped_lock lk(mutex, boost::defer_lock);
1500 
1501     if (op == READ)
1502     {
1503         fFLShminfo = fMST.getTable_read(MasterSegmentTable::EMFreeList);
1504         lk.lock();
1505     }
1506     else
1507     {
1508         fFLShminfo = fMST.getTable_write(MasterSegmentTable::EMFreeList);
1509         flLocked = true;
1510     }
1511 
1512     if (!fPFreeListImpl || fPFreeListImpl->key() != (unsigned)fFLShminfo->tableShmkey)
1513     {
1514         if (fFreeList != NULL)
1515         {
1516             fFreeList = NULL;
1517         }
1518 
1519         if (fFLShminfo->allocdSize == 0)
1520         {
1521             if (op == READ)
1522             {
1523                 lk.unlock();
1524                 fMST.getTable_upgrade(MasterSegmentTable::EMFreeList);
1525                 flLocked = true;
1526 
1527                 if (fFLShminfo->allocdSize == 0)
1528                     growFLShmseg();
1529 
1530                 flLocked = false;		// has to be done holding the write lock
1531                 fMST.getTable_downgrade(MasterSegmentTable::EMFreeList);
1532             }
1533             else
1534                 growFLShmseg();
1535         }
1536         else
1537         {
1538             fPFreeListImpl = FreeListImpl::makeFreeListImpl(fFLShminfo->tableShmkey, 0);
1539             ASSERT(fPFreeListImpl);
1540 
1541             if (r_only)
1542                 fPFreeListImpl->makeReadOnly();
1543 
1544             fFreeList = fPFreeListImpl->get();
1545 
1546             if (fFreeList == NULL)
1547             {
1548                 log_errno("ExtentMap::grabFreeList(): shmat");
1549                 throw runtime_error("ExtentMap::grabFreeList(): shmat failed.  Check the error log.");
1550             }
1551 
1552             if (op == READ)
1553                 lk.unlock();
1554         }
1555     }
1556     else
1557     {
1558         fFreeList = fPFreeListImpl->get();
1559 
1560         if (op == READ)
1561             lk.unlock();
1562     }
1563 }
1564 
releaseEMEntryTable(OPS op)1565 void ExtentMap::releaseEMEntryTable(OPS op)
1566 {
1567     if (op == READ)
1568         fMST.releaseTable_read(MasterSegmentTable::EMTable);
1569     else
1570     {
1571         /*
1572            Note: Technically we should mark it unlocked after it's unlocked,
1573            however, that's a race condition.  The only reason the up operation
1574            here will fail is if the underlying semaphore doesn't exist anymore
1575            or there is a locking logic error somewhere else.  Either way,
1576            declaring the EM unlocked here is OK.  Same with all similar assignments.
1577          */
1578         emLocked = false;
1579         fMST.releaseTable_write(MasterSegmentTable::EMTable);
1580     }
1581 }
1582 
releaseFreeList(OPS op)1583 void ExtentMap::releaseFreeList(OPS op)
1584 {
1585     if (op == READ)
1586         fMST.releaseTable_read(MasterSegmentTable::EMFreeList);
1587     else
1588     {
1589         flLocked = false;
1590         fMST.releaseTable_write(MasterSegmentTable::EMFreeList);
1591     }
1592 }
1593 
chooseEMShmkey()1594 key_t ExtentMap::chooseEMShmkey()
1595 {
1596     int fixedKeys = 1;
1597     key_t ret;
1598 
1599     if (fEMShminfo->tableShmkey + 1 == (key_t) (fShmKeys.KEYRANGE_EXTENTMAP_BASE +
1600             fShmKeys.KEYRANGE_SIZE - 1) || (unsigned)fEMShminfo->tableShmkey < fShmKeys.KEYRANGE_EXTENTMAP_BASE)
1601         ret = fShmKeys.KEYRANGE_EXTENTMAP_BASE + fixedKeys;
1602     else
1603         ret = fEMShminfo->tableShmkey + 1;
1604 
1605     return ret;
1606 }
1607 
chooseFLShmkey()1608 key_t ExtentMap::chooseFLShmkey()
1609 {
1610     int fixedKeys = 1, ret;
1611 
1612     if (fFLShminfo->tableShmkey + 1 == (key_t) (fShmKeys.KEYRANGE_EMFREELIST_BASE +
1613             fShmKeys.KEYRANGE_SIZE - 1) || (unsigned)fFLShminfo->tableShmkey < fShmKeys.KEYRANGE_EMFREELIST_BASE)
1614         ret = fShmKeys.KEYRANGE_EMFREELIST_BASE + fixedKeys;
1615     else
1616         ret = fFLShminfo->tableShmkey + 1;
1617 
1618     return ret;
1619 }
1620 
1621 /* Must be called holding the EM write lock
1622    Returns with the new shmseg mapped */
growEMShmseg(size_t nrows)1623 void ExtentMap::growEMShmseg(size_t nrows)
1624 {
1625     size_t allocSize;
1626     key_t newshmkey;
1627 
1628     if (fEMShminfo->allocdSize == 0)
1629         allocSize = EM_INITIAL_SIZE;
1630     else
1631         allocSize = fEMShminfo->allocdSize + EM_INCREMENT;
1632 
1633     newshmkey = chooseEMShmkey();
1634     ASSERT((allocSize == EM_INITIAL_SIZE && !fPExtMapImpl) || fPExtMapImpl);
1635 
1636     //Use the larger of the calculated value or the specified value
1637     allocSize = max(allocSize, nrows * sizeof(EMEntry));
1638 
1639     if (!fPExtMapImpl)
1640     {
1641         fPExtMapImpl = ExtentMapImpl::makeExtentMapImpl(newshmkey, allocSize, r_only);
1642     }
1643     else
1644     {
1645         fPExtMapImpl->grow(newshmkey, allocSize);
1646     }
1647 
1648     fEMShminfo->tableShmkey = newshmkey;
1649     fEMShminfo->allocdSize = allocSize;
1650 
1651     if (r_only)
1652         fPExtMapImpl->makeReadOnly();
1653 
1654     fExtentMap = fPExtMapImpl->get();
1655 }
1656 
1657 /* Must be called holding the FL lock
1658    Returns with the new shmseg mapped */
growFLShmseg()1659 void ExtentMap::growFLShmseg()
1660 {
1661     size_t allocSize;
1662     key_t newshmkey;
1663 
1664     if (fFLShminfo->allocdSize == 0)
1665         allocSize = EM_FREELIST_INITIAL_SIZE;
1666     else
1667         allocSize = fFLShminfo->allocdSize + EM_FREELIST_INCREMENT;
1668 
1669     newshmkey = chooseFLShmkey();
1670     ASSERT((allocSize == EM_FREELIST_INITIAL_SIZE && !fPFreeListImpl) || fPFreeListImpl);
1671 
1672     if (!fPFreeListImpl)
1673         fPFreeListImpl = FreeListImpl::makeFreeListImpl(newshmkey, allocSize, false);
1674     else
1675         fPFreeListImpl->grow(newshmkey, allocSize);
1676 
1677     fFLShminfo->tableShmkey = newshmkey;
1678     fFreeList = fPFreeListImpl->get();
1679 
1680     // init freelist entry
1681     if (fFLShminfo->allocdSize == 0)
1682     {
1683         fFreeList->size = (1ULL << 36) / 1024;
1684         fFLShminfo->currentSize = sizeof(InlineLBIDRange);
1685     }
1686 
1687     fFLShminfo->allocdSize = allocSize;
1688 
1689     if (r_only)
1690         fPFreeListImpl->makeReadOnly();
1691 
1692     fFreeList = fPFreeListImpl->get();
1693 }
1694 
1695 // @bug 1509.  Added new version of lookup that returns the first and last lbid for the extent that contains the
1696 // given lbid.
lookup(LBID_t lbid,LBID_t & firstLbid,LBID_t & lastLbid)1697 int ExtentMap::lookup(LBID_t lbid, LBID_t& firstLbid, LBID_t& lastLbid)
1698 {
1699 #ifdef BRM_INFO
1700 
1701     if (fDebug)
1702     {
1703         TRACER_WRITELATER("lookup");
1704         TRACER_ADDINPUT(lbid);
1705         TRACER_ADDOUTPUT(firstLbid);
1706         TRACER_ADDOUTPUT(lastLbid);
1707         TRACER_WRITE;
1708     }
1709 
1710 #endif
1711     int entries, i;
1712     LBID_t lastBlock;
1713 
1714 #ifdef BRM_DEBUG
1715 
1716 //printEM();
1717     if (lbid < 0)
1718     {
1719         log("ExtentMap::lookup(): lbid must be >= 0", logging::LOG_TYPE_DEBUG);
1720         cout << "ExtentMap::lookup(): lbid must be >= 0.  Lbid passed was " << lbid << endl;
1721         throw invalid_argument("ExtentMap::lookup(): lbid must be >= 0");
1722     }
1723 
1724 #endif
1725 
1726     grabEMEntryTable(READ);
1727     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
1728 
1729     for (i = 0; i < entries; i++)
1730     {
1731         if (fExtentMap[i].range.size != 0)
1732         {
1733             lastBlock = fExtentMap[i].range.start +
1734                         (static_cast<LBID_t>(fExtentMap[i].range.size) * 1024) - 1;
1735 
1736             if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock)
1737             {
1738                 firstLbid = fExtentMap[i].range.start;
1739                 lastLbid = lastBlock;
1740                 releaseEMEntryTable(READ);
1741                 return 0;
1742             }
1743         }
1744     }
1745 
1746     releaseEMEntryTable(READ);
1747     return -1;
1748 }
1749 
1750 // @bug 1055+.  New functions added for multiple files per OID enhancement.
lookupLocal(LBID_t lbid,int & OID,uint16_t & dbRoot,uint32_t & partitionNum,uint16_t & segmentNum,uint32_t & fileBlockOffset)1751 int ExtentMap::lookupLocal(LBID_t lbid, int& OID, uint16_t& dbRoot, uint32_t& partitionNum, uint16_t& segmentNum, uint32_t& fileBlockOffset)
1752 {
1753 #ifdef BRM_INFO
1754 
1755     if (fDebug)
1756     {
1757         TRACER_WRITELATER("lookupLocal");
1758         TRACER_ADDINPUT(lbid);
1759         TRACER_ADDOUTPUT(OID);
1760         TRACER_ADDSHORTOUTPUT(dbRoot);
1761         TRACER_ADDOUTPUT(partitionNum);
1762         TRACER_ADDSHORTOUTPUT(segmentNum);
1763         TRACER_ADDOUTPUT(fileBlockOffset);
1764         TRACER_WRITE;
1765     }
1766 
1767 #endif
1768 #ifdef EM_AS_A_TABLE_POC__
1769 
1770     if (lbid >= (1LL << 54))
1771     {
1772         OID = 1084;
1773         dbRoot = 1;
1774         partitionNum = 0;
1775         segmentNum = 0;
1776         fileBlockOffset = 0;
1777         return 0;
1778     }
1779 
1780 #endif
1781     int entries, i, offset;
1782     LBID_t lastBlock;
1783 
1784     if (lbid < 0)
1785     {
1786         ostringstream oss;
1787         oss << "ExtentMap::lookupLocal(): invalid lbid requested: " << lbid;
1788         log(oss.str(), logging::LOG_TYPE_CRITICAL);
1789         throw invalid_argument(oss.str());
1790     }
1791 
1792     grabEMEntryTable(READ);
1793 
1794     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
1795 
1796     for (i = 0; i < entries; i++)
1797     {
1798         if (fExtentMap[i].range.size != 0)
1799         {
1800             lastBlock = fExtentMap[i].range.start +
1801                         (static_cast<LBID_t>(fExtentMap[i].range.size) * 1024) - 1;
1802 
1803             if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock)
1804             {
1805                 OID = fExtentMap[i].fileID;
1806                 dbRoot = fExtentMap[i].dbRoot;
1807                 segmentNum = fExtentMap[i].segmentNum;
1808                 partitionNum = fExtentMap[i].partitionNum;
1809 
1810                 // TODO:  Offset logic.
1811                 offset = lbid - fExtentMap[i].range.start;
1812                 fileBlockOffset = fExtentMap[i].blockOffset + offset;
1813 
1814                 releaseEMEntryTable(READ);
1815                 return 0;
1816             }
1817         }
1818     }
1819 
1820     releaseEMEntryTable(READ);
1821     return -1;
1822 }
1823 
lookupLocal(int OID,uint32_t partitionNum,uint16_t segmentNum,uint32_t fileBlockOffset,LBID_t & LBID)1824 int ExtentMap::lookupLocal(int OID, uint32_t partitionNum, uint16_t segmentNum, uint32_t fileBlockOffset, LBID_t& LBID)
1825 {
1826 #ifdef BRM_INFO
1827 
1828     if (fDebug)
1829     {
1830         TRACER_WRITELATER("lookupLocal");
1831         TRACER_ADDINPUT(OID);
1832         TRACER_ADDINPUT(partitionNum);
1833         TRACER_ADDSHORTINPUT(segmentNum);
1834         TRACER_ADDINPUT(fileBlockOffset);
1835         TRACER_ADDOUTPUT(LBID);
1836         TRACER_WRITE;
1837     }
1838 
1839 #endif
1840     int entries, i, offset;
1841 
1842     if (OID < 0)
1843     {
1844         log("ExtentMap::lookup(): OID and FBO must be >= 0", logging::LOG_TYPE_DEBUG);
1845         throw invalid_argument("ExtentMap::lookup(): OID and FBO must be >= 0");
1846     }
1847 
1848     grabEMEntryTable(READ);
1849 
1850     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
1851 
1852     for (i = 0; i < entries; i++)
1853     {
1854 
1855         // TODO:  Blockoffset logic.
1856         if (fExtentMap[i].range.size != 0 &&
1857                 fExtentMap[i].fileID == OID &&
1858                 fExtentMap[i].partitionNum == partitionNum &&
1859                 fExtentMap[i].segmentNum == segmentNum &&
1860                 fExtentMap[i].blockOffset <= fileBlockOffset &&
1861                 fileBlockOffset <= (fExtentMap[i].blockOffset +
1862                                     (static_cast<LBID_t>(fExtentMap[i].range.size) * 1024) - 1))
1863         {
1864 
1865             offset = fileBlockOffset - fExtentMap[i].blockOffset;
1866             LBID = fExtentMap[i].range.start + offset;
1867             releaseEMEntryTable(READ);
1868             return 0;
1869         }
1870     }
1871 
1872     releaseEMEntryTable(READ);
1873     return -1;
1874 }
1875 
lookupLocal_DBroot(int OID,uint16_t dbroot,uint32_t partitionNum,uint16_t segmentNum,uint32_t fileBlockOffset,LBID_t & LBID)1876 int ExtentMap::lookupLocal_DBroot(int OID, uint16_t dbroot, uint32_t partitionNum, uint16_t segmentNum,
1877                                   uint32_t fileBlockOffset, LBID_t& LBID)
1878 {
1879 #ifdef BRM_INFO
1880 
1881     if (fDebug)
1882     {
1883         TRACER_WRITELATER("lookupLocal");
1884         TRACER_ADDINPUT(OID);
1885         TRACER_ADDINPUT(partitionNum);
1886         TRACER_ADDSHORTINPUT(segmentNum);
1887         TRACER_ADDINPUT(fileBlockOffset);
1888         TRACER_ADDOUTPUT(LBID);
1889         TRACER_WRITE;
1890     }
1891 
1892 #endif
1893     int entries, i, offset;
1894 
1895     if (OID < 0)
1896     {
1897         log("ExtentMap::lookup(): OID and FBO must be >= 0", logging::LOG_TYPE_DEBUG);
1898         throw invalid_argument("ExtentMap::lookup(): OID and FBO must be >= 0");
1899     }
1900 
1901     grabEMEntryTable(READ);
1902 
1903     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
1904 
1905     for (i = 0; i < entries; i++)
1906     {
1907 
1908         // TODO:  Blockoffset logic.
1909         if (fExtentMap[i].range.size != 0 &&
1910                 fExtentMap[i].fileID == OID &&
1911                 fExtentMap[i].dbRoot == dbroot &&
1912                 fExtentMap[i].partitionNum == partitionNum &&
1913                 fExtentMap[i].segmentNum == segmentNum &&
1914                 fExtentMap[i].blockOffset <= fileBlockOffset &&
1915                 fileBlockOffset <= (fExtentMap[i].blockOffset +
1916                                     (static_cast<LBID_t>(fExtentMap[i].range.size) * 1024) - 1))
1917         {
1918 
1919             offset = fileBlockOffset - fExtentMap[i].blockOffset;
1920             LBID = fExtentMap[i].range.start + offset;
1921             releaseEMEntryTable(READ);
1922             return 0;
1923         }
1924     }
1925 
1926     releaseEMEntryTable(READ);
1927     return -1;
1928 }
1929 
1930 // @bug 1055-.
1931 
1932 //------------------------------------------------------------------------------
1933 // Lookup/return starting LBID for the specified OID, partition, segment, and
1934 // file block offset.
1935 //------------------------------------------------------------------------------
lookupLocalStartLbid(int OID,uint32_t partitionNum,uint16_t segmentNum,uint32_t fileBlockOffset,LBID_t & LBID)1936 int ExtentMap::lookupLocalStartLbid(int      OID,
1937                                     uint32_t partitionNum,
1938                                     uint16_t segmentNum,
1939                                     uint32_t fileBlockOffset,
1940                                     LBID_t&  LBID)
1941 {
1942 #ifdef BRM_INFO
1943 
1944     if (fDebug)
1945     {
1946         TRACER_WRITELATER("lookupLocalStartLbid");
1947         TRACER_ADDINPUT(OID);
1948         TRACER_ADDINPUT(partitionNum);
1949         TRACER_ADDSHORTINPUT(segmentNum);
1950         TRACER_ADDINPUT(fileBlockOffset);
1951         TRACER_ADDOUTPUT(LBID);
1952         TRACER_WRITE;
1953     }
1954 
1955 #endif
1956     int entries, i;
1957 
1958     if (OID < 0)
1959     {
1960         log("ExtentMap::lookupLocalStartLbid(): OID and FBO must be >= 0",
1961             logging::LOG_TYPE_DEBUG);
1962         throw invalid_argument("ExtentMap::lookupLocalStartLbid(): "
1963                                "OID and FBO must be >= 0");
1964     }
1965 
1966     grabEMEntryTable(READ);
1967     entries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
1968 
1969     for (i = 0; i < entries; i++)
1970     {
1971         if (fExtentMap[i].range.size   != 0 &&
1972                 fExtentMap[i].fileID       == OID &&
1973                 fExtentMap[i].partitionNum == partitionNum &&
1974                 fExtentMap[i].segmentNum   == segmentNum &&
1975                 fExtentMap[i].blockOffset  <= fileBlockOffset &&
1976                 fileBlockOffset <= (fExtentMap[i].blockOffset +
1977                                     (static_cast<LBID_t>(fExtentMap[i].range.size) * 1024) - 1))
1978         {
1979             LBID = fExtentMap[i].range.start;
1980             releaseEMEntryTable(READ);
1981             return 0;
1982         }
1983     }
1984 
1985     releaseEMEntryTable(READ);
1986 
1987     return -1;
1988 }
1989 
1990 //------------------------------------------------------------------------------
1991 // Creates a "stripe" of column extents across a table, for the specified
1992 // columns and DBRoot.
1993 //   cols         - Vector of columns OIDs and widths to be allocated
1994 //   dbRoot       - DBRoot to be used for new extents
1995 //   partitionNum - when creating the first extent for a column (on dbRoot),
1996 //                  partitionNum must be specified as an input argument.
1997 //                  If not the first extent on dbRoot, then partitionNum
1998 //                  for the new extents will be assigned and returned, based
1999 //                  on the current last extent for dbRoot.
2000 // output:
2001 //   partitionNum - Partition number for new extents
//   segmentNum   - Segment number for new extents
2003 //   extents      - starting Lbid, numBlocks, and FBO for new extents
2004 //------------------------------------------------------------------------------
createStripeColumnExtents(const vector<CreateStripeColumnExtentsArgIn> & cols,uint16_t dbRoot,uint32_t & partitionNum,uint16_t & segmentNum,vector<CreateStripeColumnExtentsArgOut> & extents)2005 void ExtentMap::createStripeColumnExtents(
2006     const vector<CreateStripeColumnExtentsArgIn>& cols,
2007     uint16_t  dbRoot,
2008     uint32_t& partitionNum,
2009     uint16_t& segmentNum,
2010     vector<CreateStripeColumnExtentsArgOut>& extents)
2011 {
2012     LBID_t    startLbid;
2013     int       allocSize;
2014     uint32_t startBlkOffset;
2015 
2016     grabEMEntryTable(WRITE);
2017     grabFreeList(WRITE);
2018 
2019     OID_t     baselineOID = -1;
2020     uint16_t baselineSegmentNum = -1;
2021     uint32_t baselinePartNum = -1;
2022 
2023     for (uint32_t i = 0; i < cols.size(); i++)
2024     {
2025         createColumnExtent_DBroot(
2026             cols[i].oid,
2027             cols[i].width,
2028             dbRoot,
2029             cols[i].colDataType,
2030             partitionNum,
2031             segmentNum,
2032             startLbid,
2033             allocSize,
2034             startBlkOffset,
2035             false);
2036 
2037         if (i == 0)
2038         {
2039             baselineOID        = cols[i].oid;
2040             baselineSegmentNum = segmentNum;
2041             baselinePartNum    = partitionNum;
2042         }
2043         else
2044         {
2045             if ((segmentNum   != baselineSegmentNum) ||
2046                     (partitionNum != baselinePartNum))
2047             {
2048                 ostringstream oss;
2049                 oss << "ExtentMap::createStripeColumnExtents(): "
2050                     "Inconsistent segment extent creation: " <<
2051                     "DBRoot: "         << dbRoot <<
2052                     "OID1: "           << baselineOID <<
2053                     "; Part#: "        << baselinePartNum <<
2054                     "; Seg#: "         << baselineSegmentNum <<
2055                     " <versus> OID2: " << cols[i].oid <<
2056                     "; Part#: "        << partitionNum <<
2057                     "; Seg#: "         << segmentNum;
2058                 log(oss.str(), logging::LOG_TYPE_CRITICAL);
2059                 throw invalid_argument(oss.str());
2060             }
2061         }
2062 
2063         CreateStripeColumnExtentsArgOut extentInfo;
2064         extentInfo.startLbid      = startLbid;
2065         extentInfo.allocSize      = allocSize;
2066         extentInfo.startBlkOffset = startBlkOffset;
2067         extents.push_back( extentInfo );
2068     }
2069 }
2070 
2071 //------------------------------------------------------------------------------
2072 // Creates an extent for a column file on the specified DBRoot.  This is the
2073 // external API function referenced by the dbrm wrapper class.
2074 // required input:
2075 //   OID          - column OID for which the extent is to be created
2076 //   colWidth     - width of column in bytes
2077 //   dbRoot       - DBRoot where extent is to be added
2078 //   partitionNum - when creating the first extent for a column (on dbRoot),
2079 //                  partitionNum must be specified as an input argument.
2080 //                  If not the first extent on dbRoot, then partitionNum
2081 //                  for the new extent will be assigned and returned, based
2082 //                  on the current last extent for dbRoot.
2083 //   useLock      - Grab ExtentMap and FreeList WRITE lock to perform work
2084 // output:
2085 //   partitionNum - partition number for the new extent
2086 //   segmentNum   - segment number for the new extent
2087 //   lbid         - starting LBID of the created extent
2088 //   allocdsize   - number of LBIDs allocated
2089 //   startBlockOffset-starting block of the created extent
2090 //------------------------------------------------------------------------------
createColumnExtent_DBroot(int OID,uint32_t colWidth,uint16_t dbRoot,execplan::CalpontSystemCatalog::ColDataType colDataType,uint32_t & partitionNum,uint16_t & segmentNum,LBID_t & lbid,int & allocdsize,uint32_t & startBlockOffset,bool useLock)2091 void ExtentMap::createColumnExtent_DBroot(int OID,
2092         uint32_t  colWidth,
2093         uint16_t  dbRoot,
2094         execplan::CalpontSystemCatalog::ColDataType colDataType,
2095         uint32_t& partitionNum,
2096         uint16_t& segmentNum,
2097         LBID_t&    lbid,
2098         int&       allocdsize,
2099         uint32_t& startBlockOffset,
2100         bool       useLock) // defaults to true
2101 {
2102 #ifdef BRM_INFO
2103 
2104     if (fDebug)
2105     {
2106         TRACER_WRITELATER("createColumnExtent_DBroot");
2107         TRACER_ADDINPUT(OID);
2108         TRACER_ADDINPUT(colWidth);
2109         TRACER_ADDSHORTINPUT(dbRoot);
2110         TRACER_ADDOUTPUT(partitionNum);
2111         TRACER_ADDSHORTOUTPUT(segmentNum);
2112         TRACER_ADDINT64OUTPUT(lbid);
2113         TRACER_ADDOUTPUT(allocdsize);
2114         TRACER_ADDOUTPUT(startBlockOffset);
2115         TRACER_WRITE;
2116     }
2117 
2118 #endif
2119 
2120 #ifdef BRM_DEBUG
2121 
2122     if (OID <= 0)
2123     {
2124         log("ExtentMap::createColumnExtent_DBroot(): OID must be > 0",
2125             logging::LOG_TYPE_DEBUG);
2126         throw invalid_argument(
2127             "ExtentMap::createColumnExtent_DBroot(): OID must be > 0");
2128     }
2129 
2130 #endif
2131 
2132     // Convert extent size in rows to extent size in 8192-byte blocks.
2133     // extentRows should be multiple of blocksize (8192).
2134     const unsigned EXTENT_SIZE = (getExtentRows() * colWidth) / BLOCK_SIZE;
2135 
2136     if (useLock)
2137     {
2138         grabEMEntryTable(WRITE);
2139         grabFreeList(WRITE);
2140     }
2141 
2142     if (fEMShminfo->currentSize == fEMShminfo->allocdSize)
2143         growEMShmseg();
2144 
2145 //  size is the number of multiples of 1024 blocks.
2146 //  ex: size=1 --> 1024 blocks
2147 //      size=2 --> 2048 blocks
2148 //      size=3 --> 3072 blocks, etc.
2149     uint32_t size = EXTENT_SIZE / 1024;
2150 
2151     lbid = _createColumnExtent_DBroot(size, OID, colWidth,
2152                                       dbRoot, colDataType, partitionNum, segmentNum, startBlockOffset);
2153 
2154     allocdsize = EXTENT_SIZE;
2155 }
2156 
2157 //------------------------------------------------------------------------------
2158 // Creates an extent for a column file for the specified DBRoot.  This is the
2159 // internal implementation function.
2160 // input:
2161 //   size         - number of multiples of 1024 blocks allocated to the extent
2162 //                  ex: size=1 --> 1024 blocks
2163 //                      size=2 --> 2048 blocks
2164 //                      size=3 --> 3072 blocks, etc.
2165 //   OID          - column OID for which the extent is to be created
2166 //   colWidth     - width of column in bytes
2167 //   dbRoot       - dbRoot where extent is to be added
2168 //   partitionNum - when creating the first extent for an empty dbRoot,
2169 //                  partitionNum must be specified as an input argument.
2170 // output:
2171 //   partitionNum - when adding an extent to a dbRoot,
2172 //                  partitionNum will be the assigned partition number
2173 //   segmentNum   - segment number for the new extent
2174 //   startBlockOffset-starting block of the created extent
2175 // returns starting LBID of the created extent.
2176 //------------------------------------------------------------------------------
_createColumnExtent_DBroot(uint32_t size,int OID,uint32_t colWidth,uint16_t dbRoot,execplan::CalpontSystemCatalog::ColDataType colDataType,uint32_t & partitionNum,uint16_t & segmentNum,uint32_t & startBlockOffset)2177 LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID,
2178         uint32_t  colWidth,
2179         uint16_t  dbRoot,
2180         execplan::CalpontSystemCatalog::ColDataType colDataType,
2181         uint32_t& partitionNum,
2182         uint16_t& segmentNum,
2183         uint32_t& startBlockOffset)
2184 {
2185     int emptyEMEntry        = -1;
2186     int lastExtentIndex     = -1;
2187     uint32_t highestOffset = 0;
2188     uint32_t highestPartNum = 0;
2189     uint16_t highestSegNum = 0;
2190     const unsigned FILES_PER_COL_PART = getFilesPerColumnPartition();
2191     const unsigned EXTENT_ROWS        = getExtentRows();
2192     const unsigned EXTENTS_PER_SEGFILE = getExtentsPerSegmentFile();
2193     const unsigned DBROOT_COUNT       = getDbRootCount();
2194 
2195     // Variables that track list of segfiles in target (HWM) DBRoot & partition.
2196     // Map segment number to the highest fbo extent in each file
2197     typedef tr1::unordered_map<uint16_t, uint32_t> TargetDbRootSegsMap;
2198     typedef TargetDbRootSegsMap::iterator          TargetDbRootSegsMapIter;
2199     typedef TargetDbRootSegsMap::const_iterator    TargetDbRootSegsMapConstIter;
2200     TargetDbRootSegsMap targetDbRootSegs;
2201 
2202     uint32_t highEmptySegNum = 0; // high seg num for user specified partition;
2203     // only comes into play for empty DBRoot.
2204     bool bHighEmptySegNumSet = false;
2205 
2206     //--------------------------------------------------------------------------
2207     // First Step: Scan ExtentMap
2208     // 1. find HWM extent in relevant DBRoot
2209     // 2. if DBRoot is empty, track highest seg num in user specified partition
2210     // 3. Find first unused extent map entry
2211     //--------------------------------------------------------------------------
2212     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
2213 
2214     LBID_t startLBID = getLBIDsFromFreeList( size );
2215 
2216     // Find the first empty Entry; and find last extent for this OID and dbRoot
2217     for (int i = 0; i < emEntries; i++)
2218     {
2219         if (fExtentMap[i].range.size  != 0)
2220         {
2221             if (fExtentMap[i].fileID == OID)
2222             {
2223 
2224                 // 1. Find HWM extent in relevant DBRoot
2225                 if (fExtentMap[i].dbRoot == dbRoot)
2226                 {
2227                     if ( (fExtentMap[i].partitionNum >  highestPartNum) ||
2228                             ((fExtentMap[i].partitionNum == highestPartNum) &&
2229                              (fExtentMap[i].blockOffset   >  highestOffset)) ||
2230                             ((fExtentMap[i].partitionNum == highestPartNum) &&
2231                              (fExtentMap[i].blockOffset   == highestOffset)  &&
2232                              (fExtentMap[i].segmentNum    >= highestSegNum)) )
2233                     {
2234 
2235                         lastExtentIndex = i;
2236                         highestPartNum  = fExtentMap[i].partitionNum;
2237                         highestSegNum   = fExtentMap[i].segmentNum;
2238                         highestOffset   = fExtentMap[i].blockOffset;
2239                     }
2240                 }
2241 
2242                 // 2. for empty DBRoot track hi seg# in user specified part#
2243                 if ((lastExtentIndex == -1) &&
2244                         (fExtentMap[i].partitionNum == partitionNum))
2245                 {
2246                     if ((fExtentMap[i].segmentNum > highEmptySegNum) ||
2247                             (!bHighEmptySegNumSet))
2248                     {
2249                         highEmptySegNum = fExtentMap[i].segmentNum;
2250                         bHighEmptySegNumSet = true;
2251                     }
2252                 }
2253             }         // found extentmap entry for specified OID
2254         }             // found valid extentmap entry
2255 
2256         // 3. Find first available extent map entry that can be reused
2257         else if (emptyEMEntry < 0)
2258             emptyEMEntry = i;
2259     } // Loop through extent map entries
2260 
2261     if (emptyEMEntry == -1)
2262     {
2263         ostringstream oss;
2264         oss << "ExtentMap::_createColumnExtent_DBroot(): "
2265             "could not find an empty EMEntry for OID " << OID <<
2266             "; Extent Map is full",
2267             log(oss.str(),
2268                 logging::LOG_TYPE_CRITICAL);
2269         throw logic_error( oss.str() );
2270     }
2271 
2272     //--------------------------------------------------------------------------
2273     // If DBRoot is not empty, then...
2274     // Second Step: Scan ExtentMap again after I know the last partition
2275     // 4. track highest seg num for HWM+1 partition
2276     // 5. track highest seg num for HWM    partition
2277     // 6. save list of segment numbers and fbos in target DBRoot and partition
2278     //
2279     // Scanning the extentmap a second time is not a good thing to be doing.
2280     // But the alternative isn't good either.  There is certain information
2281     // I need to capture about the last partition and DBRoot, and for the next
2282     // partition as well (which may contain segment files on other DBRoots),
2283     // but until I scan the extentmap, I don't know what my last partition is.
2284     // If I try to do this in a single scan, then I am forced to spend time
2285     // capturing information about partitions that turn out to be inconse-
2286     // quential because the "known" last partition will keep changing as I
2287     // scan the extentmap.
2288     //--------------------------------------------------------------------------
2289     bool bSegsOutOfService = false;
2290     int partHighSeg     = -1; // hi seg num for last partition
2291     int partHighSegNext = -1; // hi seg num for next partition
2292 
2293     if (lastExtentIndex >= 0)
2294     {
2295         uint32_t targetDbRootPart = fExtentMap[lastExtentIndex].partitionNum;
2296         uint32_t targetDbRootPartNext = targetDbRootPart + 1;
2297         partHighSeg                = fExtentMap[lastExtentIndex].segmentNum;
2298         targetDbRootSegs.insert( TargetDbRootSegsMap::value_type(
2299                                      fExtentMap[lastExtentIndex].segmentNum,
2300                                      fExtentMap[lastExtentIndex].blockOffset) );
2301 
2302         for (int i = 0; i < emEntries; i++)
2303         {
2304             if (fExtentMap[i].range.size  != 0)
2305             {
2306                 if (fExtentMap[i].fileID == OID)
2307                 {
2308 
2309                     // 4. Track hi seg for hwm+1 partition
2310                     if (fExtentMap[i].partitionNum == targetDbRootPartNext)
2311                     {
2312                         if (fExtentMap[i].segmentNum > partHighSegNext)
2313                         {
2314                             partHighSegNext = fExtentMap[i].segmentNum;
2315                         }
2316                     }
2317 
2318                     // 5. Track hi seg for hwm partition
2319                     else if (fExtentMap[i].partitionNum == targetDbRootPart)
2320                     {
2321                         if (fExtentMap[i].segmentNum > partHighSeg)
2322                         {
2323                             partHighSeg = fExtentMap[i].segmentNum;
2324                         }
2325 
2326                         // 6. Save list of seg files in target DBRoot/Partition,
2327                         //    along with the highest fbo for each seg file
2328                         if (fExtentMap[i].dbRoot == dbRoot)
2329                         {
2330                             if (fExtentMap[i].status == EXTENTOUTOFSERVICE)
2331                                 bSegsOutOfService = true;
2332 
2333                             TargetDbRootSegsMapIter iter =
2334                                 targetDbRootSegs.find(fExtentMap[i].segmentNum);
2335 
2336                             if (iter == targetDbRootSegs.end())
2337                             {
2338                                 targetDbRootSegs.insert(
2339                                     TargetDbRootSegsMap::value_type(
2340                                         fExtentMap[i].segmentNum,
2341                                         fExtentMap[i].blockOffset) );
2342                             }
2343                             else
2344                             {
2345                                 if (fExtentMap[i].blockOffset > iter->second)
2346                                 {
2347                                     iter->second = fExtentMap[i].blockOffset;
2348                                 }
2349                             }
2350                         }
2351                     }
2352                 }   // found extentmap entry for specified OID
2353             }       // found valid extentmap entry
2354         }           // loop through extent map entries
2355     }               // (lastExtentIndex >= 0)
2356 
2357     //--------------------------------------------------------------------------
2358     // Third Step: Select partition and segment number for new extent
2359     // 1. Loop through targetDbRootSegs to find segment file for next extent
2360     // 2. Check for exceptions that warrant going to next physical partition
2361     //    a. See if any extents are marked outOfService
2362     //    b. See if extents are not evenly layered as expected
2363     // 3. Perform additional new partition/segment logic as applicable
2364     //    a. No action taken if 2a or 2b already detected need for new partition
2365     //    b. If HWM extent is in last file of DBRoot/Partition, see if next
2366     //       extent goes in new partition, or if wrap-around within current
2367     //       partition.
2368     //    c. If extent needs to go in next partition, figure out the next
2369     //       partition and the next available segment in that partition.
2370     // 4. Set blockOffset of new extent based on where extent is being added
2371     //--------------------------------------------------------------------------
2372     uint16_t newDbRoot       = dbRoot;
2373     uint32_t newPartitionNum = partitionNum;
2374     uint16_t newSegmentNum   = 0;
2375     uint32_t newBlockOffset  = 0;
2376 
2377     // If this is not the first extent for this OID and DBRoot then
2378     //   extrapolate part# and seg# from last extent; wrap around segment and
2379     //   partition number as needed.
2380     // else
2381     //   use part# that the user specifies
2382     if (lastExtentIndex >= 0)
2383     {
2384         bool startNewPartition       = false;
2385         bool startNewStripeInSegFile = false;
2386         const unsigned int filesPerDBRootPerPartition =
2387             FILES_PER_COL_PART / DBROOT_COUNT;
2388 
2389         int& lastExtIdx = lastExtentIndex;
2390 
2391         // Find first, last, next seg files in target partition and DBRoot
2392         uint16_t firstTargetSeg = fExtentMap[lastExtIdx].segmentNum;
2393         uint16_t lastTargetSeg  = fExtentMap[lastExtIdx].segmentNum;
2394         uint16_t nextTargetSeg  = fExtentMap[lastExtIdx].segmentNum;
2395 
2396         // 1. Loop thru targetDbRootSegs[] to find next segment after
2397         //    lastExtIdx in target list.
2398         //    We save low and high segment to use in wrap-around case.
2399         if (targetDbRootSegs.size() > 1)
2400         {
2401             bool bNextSegSet = false;
2402 
2403             for (TargetDbRootSegsMapConstIter iter = targetDbRootSegs.begin();
2404                     iter != targetDbRootSegs.end();
2405                     ++iter)
2406             {
2407                 uint16_t targetSeg = iter->first;
2408 
2409                 if (targetSeg      < firstTargetSeg)
2410                     firstTargetSeg = targetSeg;
2411                 else if (targetSeg > lastTargetSeg)
2412                     lastTargetSeg  = targetSeg;
2413 
2414                 if (targetSeg > fExtentMap[lastExtIdx].segmentNum)
2415                 {
2416                     if ((targetSeg < nextTargetSeg) || (!bNextSegSet))
2417                     {
2418                         nextTargetSeg = targetSeg;
2419                         bNextSegSet   = true;
2420                     }
2421                 }
2422             }
2423         }
2424 
2425         newPartitionNum = fExtentMap[lastExtIdx].partitionNum;
2426 
2427         // 2a. Skip to next physical partition if any extents in HWM partition/
2428         //     DBRoot are marked as outOfService
2429         if (bSegsOutOfService)
2430         {
2431 
2432 //			cout << "Skipping to next partition (outOfService segs)" <<
2433 //				": oid-"  << fExtentMap[lastExtentIndex].fileID <<
2434 //				"; root-" << fExtentMap[lastExtentIndex].dbRoot <<
2435 //				"; part-" << fExtentMap[lastExtentIndex].partitionNum << endl;
2436 
2437             startNewPartition = true;
2438         }
2439 
2440         // @bug 4765
2441         // 2b. Skip to next physical partition if we have a set of
2442         // segment files that are not "layered" as expected, meaning we
2443         // have > 1 layer of extents with an incomplete lower layer (could
2444         // be caused by the dropping of logical partitions).
2445         else if (targetDbRootSegs.size() < filesPerDBRootPerPartition)
2446         {
2447             for (TargetDbRootSegsMapConstIter iter = targetDbRootSegs.begin();
2448                     iter != targetDbRootSegs.end();
2449                     ++iter)
2450             {
2451                 if (iter->second > 0)
2452                 {
2453 
2454 //					cout << "Skipping to next partition (unbalanced)" <<
2455 //						": oid-"  << fExtentMap[lastExtentIndex].fileID <<
2456 //						"; root-" << fExtentMap[lastExtentIndex].dbRoot <<
2457 //						"; part-" << fExtentMap[lastExtentIndex].partitionNum <<
2458 //						"; seg-"  << iter->first  <<
2459 //						"; hifbo-"<< iter->second << endl;
2460 
2461                     startNewPartition = true;
2462                     break;
2463                 }
2464             }
2465         }
2466 
2467         // 3a.If we already detected need for new partition, then take no action
2468         if (startNewPartition)
2469         {
2470             // no action taken here; we take additional action later.
2471         }
2472 
2473         // 3b.If HWM extent is in last seg file for this partition and DBRoot,
2474         //    find out if we need to add a new partition for next extent.
2475         else if (targetDbRootSegs.size() >= filesPerDBRootPerPartition)
2476         {
2477             if (fExtentMap[lastExtIdx].segmentNum == lastTargetSeg)
2478             {
2479                 // Use blockOffset of lastExtIdx to see if we need to add
2480                 // the next extent to a new partition.
2481                 if (fExtentMap[lastExtIdx].blockOffset ==
2482                         ((EXTENTS_PER_SEGFILE - 1) *
2483                          (EXTENT_ROWS * colWidth / BLOCK_SIZE)) )
2484                 {
2485                     startNewPartition = true;
2486                 }
2487                 else   // Wrap-around; add extent to low seg in this partition
2488                 {
2489                     startNewStripeInSegFile = true;
2490                     newSegmentNum = firstTargetSeg;
2491                 }
2492             }
2493             else
2494             {
2495                 newSegmentNum = nextTargetSeg;
2496             }
2497         }
2498         else   // Select next segment file in current HWM partition
2499         {
2500             newSegmentNum = partHighSeg + 1;
2501         }
2502 
2503         // 3c. Find new partition and segment if we can't create
2504         //     an extent for this DBRoot in the current HWM partition.
2505         if (startNewPartition)
2506         {
2507             newPartitionNum++;
2508 
2509             if (partHighSegNext == -1)
2510                 newSegmentNum = 0;
2511             else
2512                 newSegmentNum = partHighSegNext + 1;
2513         }
2514 
2515         // 4. Set blockOffset (fbo) for new extent relative to it's seg file
2516         // case1: Init fbo to 0 if first extent in partition/DbRoot
2517         // case2: Init fbo to 0 if first extent in segment file (other than
2518         //        first segment in this partition/DbRoot, which case1 handled)
2519         // case3: Init fbo based on previous extent
2520 
2521         // case1: leave newBlockOffset set to 0
2522         if (startNewPartition)
2523         {
2524             //...no action necessary
2525         }
2526 
2527         // case2: leave newBlockOffset set to 0
2528         else if ((fExtentMap[lastExtIdx].blockOffset == 0) &&
2529                  (newSegmentNum > firstTargetSeg))
2530         {
2531             //...no action necessary
2532         }
2533 
2534         // case3: Init blockOffset based on previous extent.  If we are adding
2535         //        extent to 1st seg file, then need to bump up the offset; else
2536         //        adding extent to same stripe and can repeat the same offset.
2537         else
2538         {
2539             if (startNewStripeInSegFile)        // start next stripe
2540             {
2541                 newBlockOffset = static_cast<uint64_t>
2542                                  (fExtentMap[lastExtIdx].range.size) * 1024 +
2543                                  fExtentMap[lastExtIdx].blockOffset;
2544             }
2545             else  								// next extent, same stripe
2546             {
2547                 newBlockOffset = fExtentMap[lastExtIdx].blockOffset;
2548             }
2549         }
2550     }   // lastExtentIndex >= 0
2551     else  	// Empty DBRoot; use part# that the user specifies
2552     {
2553         if (bHighEmptySegNumSet)
2554             newSegmentNum = highEmptySegNum + 1;
2555         else
2556             newSegmentNum = 0;
2557     }
2558 
2559     //--------------------------------------------------------------------------
2560     // Fourth Step: Construct the new extentmap entry
2561     //--------------------------------------------------------------------------
2562 
2563     makeUndoRecord(&fExtentMap[emptyEMEntry], sizeof(EMEntry));
2564     EMEntry* e      = &fExtentMap[emptyEMEntry];
2565 
2566     e->range.start  = startLBID;
2567     e->range.size   = size;
2568     e->fileID       = OID;
2569 
2570     if (isUnsigned(colDataType))
2571     {
2572         e->partition.cprange.lo_val = numeric_limits<uint64_t>::max();
2573         e->partition.cprange.hi_val = 0;
2574     }
2575     else
2576     {
2577         e->partition.cprange.lo_val = numeric_limits<int64_t>::max();
2578         e->partition.cprange.hi_val = numeric_limits<int64_t>::min();
2579     }
2580 
2581     e->partition.cprange.sequenceNum = 0;
2582 
2583     e->colWid       = colWidth;
2584 
2585     e->dbRoot       = newDbRoot;
2586     e->partitionNum = newPartitionNum;
2587     e->segmentNum   = newSegmentNum;
2588 
2589     e->blockOffset  = newBlockOffset;
2590     e->HWM          = 0;
2591     e->status       = EXTENTUNAVAILABLE; // mark extent as in process
2592 
2593     // Partition, segment, and blockOffset 0 represents new table or column.
2594     // When DDL creates a table, we can mark the first extent as VALID, since
2595     // the table has no data.  Marking as VALID enables cpimport to update
2596     // the CP min/max for the first import.
2597     // If DDL is adding a column to an existing table, setting to VALID won't
2598     // hurt, because DDL resets to INVALID after the extent is created.
2599     if ((e->partitionNum == 0) &&
2600             (e->segmentNum   == 0) &&
2601             (e->blockOffset  == 0))
2602         e->partition.cprange.isValid = CP_VALID;
2603     else
2604         e->partition.cprange.isValid = CP_INVALID;
2605 
2606     partitionNum    = e->partitionNum;
2607     segmentNum      = e->segmentNum;
2608     startBlockOffset = e->blockOffset;
2609 
2610     makeUndoRecord(fEMShminfo, sizeof(MSTEntry));
2611     fEMShminfo->currentSize += sizeof(struct EMEntry);
2612 
2613     return startLBID;
2614 }
2615 
2616 //------------------------------------------------------------------------------
2617 // Creates an extent for the exact segment column file specified by the
2618 // requested OID, DBRoot, partition number, and segment number.  This is
2619 // the external API function referenced by the dbrm wrapper class.
2620 // required input:
2621 //   OID          - column OID for which the extent is to be created
2622 //   colWidth     - width of column in bytes
2623 //   dbRoot       - DBRoot where extent is to be added
2624 //   partitionNum - partitionNum
2625 //   segmentNum   - segmentNum
2626 // output:
2627 //   lbid         - starting LBID of the created extent
2628 //   allocdsize   - number of LBIDs allocated
2629 //   startBlockOffset-starting block of the created extent
2630 //------------------------------------------------------------------------------
createColumnExtentExactFile(int OID,uint32_t colWidth,uint16_t dbRoot,uint32_t partitionNum,uint16_t segmentNum,execplan::CalpontSystemCatalog::ColDataType colDataType,LBID_t & lbid,int & allocdsize,uint32_t & startBlockOffset)2631 void ExtentMap::createColumnExtentExactFile(int OID,
2632         uint32_t  colWidth,
2633         uint16_t  dbRoot,
2634         uint32_t partitionNum,
2635         uint16_t segmentNum,
2636         execplan::CalpontSystemCatalog::ColDataType colDataType,
2637         LBID_t&    lbid,
2638         int&       allocdsize,
2639         uint32_t& startBlockOffset)
2640 {
2641 #ifdef BRM_INFO
2642 
2643     if (fDebug)
2644     {
2645         TRACER_WRITELATER("createColumnExtentExactFile");
2646         TRACER_ADDINPUT(OID);
2647         TRACER_ADDINPUT(colWidth);
2648         TRACER_ADDSHORTINPUT(dbRoot);
2649         TRACER_ADDOUTPUT(partitionNum);
2650         TRACER_ADDSHORTOUTPUT(segmentNum);
2651         TRACER_ADDINT64OUTPUT(lbid);
2652         TRACER_ADDOUTPUT(allocdsize);
2653         TRACER_ADDOUTPUT(startBlockOffset);
2654         TRACER_WRITE;
2655     }
2656 
2657 #endif
2658 
2659 #ifdef BRM_DEBUG
2660 
2661     if (OID <= 0)
2662     {
2663         log("ExtentMap::createColumnExtentExactFile(): OID must be > 0",
2664             logging::LOG_TYPE_DEBUG);
2665         throw invalid_argument(
2666             "ExtentMap::createColumnExtentExactFile(): OID must be > 0");
2667     }
2668 
2669 #endif
2670 
2671     // Convert extent size in rows to extent size in 8192-byte blocks.
2672     // extentRows should be multiple of blocksize (8192).
2673     const unsigned EXTENT_SIZE = (getExtentRows() * colWidth) / BLOCK_SIZE;
2674     grabEMEntryTable(WRITE);
2675     grabFreeList(WRITE);
2676 
2677     if (fEMShminfo->currentSize == fEMShminfo->allocdSize)
2678         growEMShmseg();
2679 
2680 //  size is the number of multiples of 1024 blocks.
2681 //  ex: size=1 --> 1024 blocks
2682 //      size=2 --> 2048 blocks
2683 //      size=3 --> 3072 blocks, etc.
2684     uint32_t size = EXTENT_SIZE / 1024;
2685 
2686     lbid = _createColumnExtentExactFile(size, OID, colWidth,
2687                                         dbRoot, partitionNum, segmentNum, colDataType, startBlockOffset);
2688 
2689     allocdsize = EXTENT_SIZE;
2690 }
2691 
2692 //------------------------------------------------------------------------------
2693 // Creates an extent for the exact segment file specified by the requested
2694 // OID, DBRoot, partition, and segment.  This is the internal implementation
2695 // function.
2696 // input:
2697 //   size         - number of multiples of 1024 blocks allocated to the extent
2698 //                  ex: size=1 --> 1024 blocks
2699 //                      size=2 --> 2048 blocks
2700 //                      size=3 --> 3072 blocks, etc.
2701 //   OID          - column OID for which the extent is to be created
2702 //   colWidth     - width of column in bytes
2703 //   dbRoot       - dbRoot where extent is to be added
2704 //   partitionNum - partitionNum
2705 //   segmentNum   - segmentNum
2706 // output:
2707 //   startBlockOffset-starting block of the created extent
2708 // returns starting LBID of the created extent.
2709 //------------------------------------------------------------------------------
_createColumnExtentExactFile(uint32_t size,int OID,uint32_t colWidth,uint16_t dbRoot,uint32_t partitionNum,uint16_t segmentNum,execplan::CalpontSystemCatalog::ColDataType colDataType,uint32_t & startBlockOffset)2710 LBID_t ExtentMap::_createColumnExtentExactFile(uint32_t size, int OID,
2711         uint32_t  colWidth,
2712         uint16_t  dbRoot,
2713         uint32_t  partitionNum,
2714         uint16_t  segmentNum,
2715         execplan::CalpontSystemCatalog::ColDataType colDataType,
2716         uint32_t& startBlockOffset)
2717 {
2718     int emptyEMEntry        = -1;
2719     int lastExtentIndex     = -1;
2720     uint32_t highestOffset = 0;
2721 
2722     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
2723     LBID_t startLBID = getLBIDsFromFreeList( size );
2724 
2725     // Find the first empty Entry; and find the last extent for this
2726     // combination of OID, partition, and segment.
2727     for (int i = 0; i < emEntries; i++)
2728     {
2729         if (fExtentMap[i].range.size != 0)
2730         {
2731             if (fExtentMap[i].fileID == OID)
2732             {
2733                 if ((fExtentMap[i].dbRoot       == dbRoot) &&
2734                         (fExtentMap[i].partitionNum == partitionNum) &&
2735                         (fExtentMap[i].segmentNum   == segmentNum) &&
2736                         (fExtentMap[i].blockOffset  >= highestOffset))
2737                 {
2738                     lastExtentIndex = i;
2739                     highestOffset = fExtentMap[i].blockOffset;
2740                 }
2741             }
2742         }
2743         else if (emptyEMEntry < 0)
2744             emptyEMEntry = i;
2745     } // Loop through extent map entries
2746 
2747     if (emptyEMEntry == -1)
2748     {
2749         ostringstream oss;
2750         oss << "ExtentMap::_createColumnExtentExactFile(): "
2751             "could not find an empty EMEntry for OID " << OID <<
2752             "; Extent Map is full",
2753             log(oss.str(),
2754                 logging::LOG_TYPE_CRITICAL);
2755         throw logic_error( oss.str() );
2756     }
2757 
2758     makeUndoRecord(&fExtentMap[emptyEMEntry], sizeof(EMEntry));
2759     EMEntry* e      = &fExtentMap[emptyEMEntry];
2760 
2761     e->range.start  = startLBID;
2762     e->range.size   = size;
2763     e->fileID       = OID;
2764 
2765     if (isUnsigned(colDataType))
2766     {
2767         e->partition.cprange.lo_val = numeric_limits<uint64_t>::max();
2768         e->partition.cprange.hi_val = 0;
2769     }
2770     else
2771     {
2772         e->partition.cprange.lo_val = numeric_limits<int64_t>::max();
2773         e->partition.cprange.hi_val = numeric_limits<int64_t>::min();
2774     }
2775 
2776     e->partition.cprange.sequenceNum = 0;
2777 
2778     e->colWid       = colWidth;
2779 
2780     e->dbRoot       = dbRoot;
2781     e->partitionNum = partitionNum;
2782     e->segmentNum   = segmentNum;
2783     e->status       = EXTENTUNAVAILABLE; // mark extent as in process
2784 
2785     // If first extent for this OID, partition, dbroot, and segment then
2786     //   blockOffset is set to 0
2787     // else
2788     //   blockOffset is extrapolated from the last extent
2789     if (lastExtentIndex == -1)
2790     {
2791         e->blockOffset  = 0;
2792         e->HWM          = 0;
2793     }
2794     else
2795     {
2796         e->blockOffset  = static_cast<uint64_t>
2797                           (fExtentMap[lastExtentIndex].range.size) * 1024 +
2798                           fExtentMap[lastExtentIndex].blockOffset;
2799         e->HWM          = 0;
2800     }
2801 
2802     // Partition, segment, and blockOffset 0 represents new table or column.
2803     // When DDL creates a table, we can mark the first extent as VALID, since
2804     // the table has no data.  Marking as VALID enables cpimport to update
2805     // the CP min/max for the first import.
2806     // If DDL is adding a column to an existing table, setting to VALID won't
2807     // hurt, because DDL resets to INVALID after the extent is created.
2808     if ((e->partitionNum == 0) &&
2809             (e->segmentNum   == 0) &&
2810             (e->blockOffset  == 0))
2811         e->partition.cprange.isValid = CP_VALID;
2812     else
2813         e->partition.cprange.isValid = CP_INVALID;
2814 
2815     startBlockOffset = e->blockOffset;
2816 
2817     makeUndoRecord(fEMShminfo, sizeof(MSTEntry));
2818     fEMShminfo->currentSize += sizeof(struct EMEntry);
2819 
2820     return startLBID;
2821 }
2822 
2823 //------------------------------------------------------------------------------
2824 // Creates an extent for a dictionary store file.  This is the external API
2825 // function.
2826 // input:
2827 //   OID          - column OID for which the extent is to be created
2828 //   dbRoot       - DBRoot to be assigned to the new extent
2829 //   partitionNum - partition number to be assigned to the new extent
2830 //   segmentNum   - segment number to be assigned to the new extent
2831 // output:
2832 //   lbid         - starting LBID of the created extent
2833 //   allocdsize   - number LBIDs of allocated
2834 //------------------------------------------------------------------------------
createDictStoreExtent(int OID,uint16_t dbRoot,uint32_t partitionNum,uint16_t segmentNum,LBID_t & lbid,int & allocdsize)2835 void ExtentMap::createDictStoreExtent(int OID,
2836                                       uint16_t  dbRoot,
2837                                       uint32_t  partitionNum,
2838                                       uint16_t  segmentNum,
2839                                       LBID_t&    lbid,
2840                                       int&       allocdsize)
2841 {
2842 #ifdef BRM_INFO
2843 
2844     if (fDebug)
2845     {
2846         TRACER_WRITELATER("createDictStoreExtent");
2847         TRACER_ADDINPUT(OID);
2848         TRACER_ADDSHORTINPUT(dbRoot);
2849         TRACER_ADDINPUT(partitionNum);
2850         TRACER_ADDSHORTINPUT(segmentNum);
2851         TRACER_ADDINT64OUTPUT(lbid);
2852         TRACER_ADDOUTPUT(allocdsize);
2853         TRACER_WRITE;
2854     }
2855 
2856 #endif
2857 
2858 #ifdef BRM_DEBUG
2859 
2860     if (OID <= 0)
2861     {
2862         log("ExtentMap::createDictStoreExtent(): OID must be > 0",
2863             logging::LOG_TYPE_DEBUG);
2864         throw invalid_argument(
2865             "ExtentMap::createDictStoreExtent(): OID must be > 0");
2866     }
2867 
2868 #endif
2869 
2870     // Convert extent size in rows to extent size in 8192-byte blocks.
2871     // extentRows should be multiple of blocksize (8192).
2872     const unsigned EXTENT_SIZE = (getExtentRows() * DICT_COL_WIDTH) / BLOCK_SIZE;
2873 
2874     grabEMEntryTable(WRITE);
2875     grabFreeList(WRITE);
2876 
2877     if (fEMShminfo->currentSize == fEMShminfo->allocdSize)
2878         growEMShmseg();
2879 
2880 //  size is the number of multiples of 1024 blocks.
2881 //  ex: size=1 --> 1024 blocks
2882 //      size=2 --> 2048 blocks
2883 //      size=3 --> 3072 blocks, etc.
2884     uint32_t size = EXTENT_SIZE / 1024;
2885 
2886     lbid = _createDictStoreExtent(size, OID,
2887                                   dbRoot, partitionNum, segmentNum);
2888 
2889     allocdsize = EXTENT_SIZE;
2890 }
2891 
2892 //------------------------------------------------------------------------------
2893 // Creates an extent for a dictionary store file.  This is the internal
2894 // implementation function.
2895 // input:
2896 //   size         - number of multiples of 1024 blocks allocated to the extent
2897 //                  ex: size=1 --> 1024 blocks
2898 //                      size=2 --> 2048 blocks
2899 //                      size=3 --> 3072 blocks, etc.
2900 //   OID          - column OID for which the extent is to be created
2901 //   dbRoot       - DBRoot to be assigned to the new extent
2902 //   partitionNum - partition number to be assigned to the new extent
2903 //   segmentNum   - segment number to be assigned to the new extent
2904 // returns starting LBID of the created extent.
2905 //------------------------------------------------------------------------------
_createDictStoreExtent(uint32_t size,int OID,uint16_t dbRoot,uint32_t partitionNum,uint16_t segmentNum)2906 LBID_t ExtentMap::_createDictStoreExtent(uint32_t size, int OID,
2907         uint16_t  dbRoot,
2908         uint32_t  partitionNum,
2909         uint16_t  segmentNum)
2910 {
2911     int emptyEMEntry        = -1;
2912     int lastExtentIndex     = -1;
2913     uint32_t highestOffset = 0;
2914 
2915     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
2916 
2917     LBID_t startLBID = getLBIDsFromFreeList( size );
2918 
2919     // Find the first empty Entry; and find the last extent for this
2920     // combination of OID, partition, and segment.
2921     for (int i = 0; i < emEntries; i++)
2922     {
2923         if (fExtentMap[i].range.size != 0)
2924         {
2925             if ((fExtentMap[i].fileID       == OID) &&
2926                     (fExtentMap[i].partitionNum == partitionNum) &&
2927                     (fExtentMap[i].segmentNum   == segmentNum) &&
2928                     (fExtentMap[i].blockOffset  >= highestOffset))
2929             {
2930                 lastExtentIndex = i;
2931                 highestOffset = fExtentMap[i].blockOffset;
2932             }
2933         }
2934         else if (emptyEMEntry < 0)
2935             emptyEMEntry = i;
2936     } // Loop through extent map entries
2937 
2938     if (emptyEMEntry == -1)
2939     {
2940         ostringstream oss;
2941         oss << "ExtentMap::_createDictStoreExtent(): "
2942             "could not find an empty EMEntry for OID " << OID <<
2943             "; Extent Map is full",
2944             log(oss.str(),
2945                 logging::LOG_TYPE_CRITICAL);
2946         throw logic_error( oss.str() );
2947     }
2948 
2949     makeUndoRecord(&fExtentMap[emptyEMEntry], sizeof(EMEntry));
2950     EMEntry* e      = &fExtentMap[emptyEMEntry];
2951 
2952     e->range.start  = startLBID;
2953     e->range.size   = size;
2954     e->fileID       = OID;
2955     e->status       = EXTENTUNAVAILABLE;// @bug 1911 mark extent as in process
2956     e->partition.cprange.lo_val = numeric_limits<int64_t>::max();
2957     e->partition.cprange.hi_val = numeric_limits<int64_t>::min();
2958     e->partition.cprange.sequenceNum = 0;
2959     e->partition.cprange.isValid     = CP_INVALID;
2960 
2961     // If this is first extent for this OID, partition, segment then
2962     //   everything is set to 0 or taken from user input
2963     // else
2964     //   everything is extrapolated from the last extent
2965     if (lastExtentIndex == -1)
2966     {
2967         e->blockOffset  = 0;
2968         e->HWM          = 0;
2969         e->segmentNum   = segmentNum;
2970         e->partitionNum = partitionNum;
2971         e->dbRoot       = dbRoot;
2972         e->colWid       = 0; // we don't store col width for dictionaries;
2973         // this helps to flag this as a dictionary extent
2974     }
2975     else
2976     {
2977         e->blockOffset  = static_cast<uint64_t>
2978                           (fExtentMap[lastExtentIndex].range.size) * 1024 +
2979                           fExtentMap[lastExtentIndex].blockOffset;
2980         e->HWM          = 0;
2981         e->segmentNum   = fExtentMap[lastExtentIndex].segmentNum;
2982         e->partitionNum = fExtentMap[lastExtentIndex].partitionNum;
2983         e->dbRoot       = fExtentMap[lastExtentIndex].dbRoot;
2984         e->colWid       = fExtentMap[lastExtentIndex].colWid;
2985     }
2986 
2987     makeUndoRecord(fEMShminfo, sizeof(MSTEntry));
2988     fEMShminfo->currentSize += sizeof(struct EMEntry);
2989 
2990     return startLBID;
2991 }
2992 
2993 //------------------------------------------------------------------------------
2994 // Finds and returns the starting LBID for an LBID range taken from the
2995 // free list.
2996 // input:
2997 //   size - number of multiples of 1024 blocks needed from the free list
2998 //          ex: size=1 --> 1024 blocks
2999 //              size=2 --> 2048 blocks
3000 //              size=3 --> 3072 blocks, etc.
3001 // returns selected starting LBID.
3002 //------------------------------------------------------------------------------
getLBIDsFromFreeList(uint32_t size)3003 LBID_t ExtentMap::getLBIDsFromFreeList ( uint32_t size )
3004 {
3005     LBID_t ret = -1;
3006     int i;
3007     int flEntries = fFLShminfo->allocdSize / sizeof(InlineLBIDRange);
3008 
3009     for (i = 0; i < flEntries; i++)
3010     {
3011         if (size <= fFreeList[i].size)
3012         {
3013             makeUndoRecord(&fFreeList[i], sizeof(InlineLBIDRange));
3014             ret = fFreeList[i].start;
3015             fFreeList[i].start += size * 1024;
3016             fFreeList[i].size -= size;
3017 
3018             if (fFreeList[i].size == 0)
3019             {
3020                 makeUndoRecord(fFLShminfo, sizeof(MSTEntry));
3021                 fFLShminfo->currentSize -= sizeof(InlineLBIDRange);
3022             }
3023 
3024             break;
3025         }
3026     }
3027 
3028     if (i == flEntries)
3029     {
3030         log("ExtentMap::getLBIDsFromFreeList(): out of LBID space");
3031         throw runtime_error(
3032             "ExtentMap::getLBIDsFromFreeList(): out of LBID space");
3033     }
3034 
3035     return ret;
3036 }
3037 
3038 #ifdef BRM_DEBUG
printEM(const EMEntry & em) const3039 void ExtentMap::printEM(const EMEntry& em) const
3040 {
3041     cout << " Start "
3042          << em.range.start << " Size "
3043          << (long) em.range.size << " OID "
3044          << (long) em.fileID << " offset "
3045          << (long) em.blockOffset
3046          << " LV " << em.partition.cprange.lo_val
3047          << " HV " << em.partition.cprange.hi_val;
3048     cout << endl;
3049 }
3050 
3051 
printEM(const OID_t & oid) const3052 void ExtentMap::printEM(const OID_t& oid) const
3053 {
3054     int emEntries = 0;
3055 
3056     if (fEMShminfo)
3057         emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
3058 
3059     cout << "Extent Map (OID=" << oid << ")" << endl;
3060 
3061     for (int idx = 0; idx < emEntries ; idx++)
3062     {
3063         struct EMEntry& em = fExtentMap[idx];
3064 
3065         if (em.fileID == oid && em.range.size != 0)
3066             printEM(em);
3067     }
3068 
3069     cout << endl;
3070 }
3071 
printEM() const3072 void ExtentMap::printEM() const
3073 {
3074 
3075     int emEntries = 0;
3076 
3077     if (fEMShminfo)
3078         emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
3079 
3080     cout << "Extent Map (" << emEntries << ")" << endl;
3081 
3082     for (int idx = 0; idx < emEntries ; idx++)
3083     {
3084         struct EMEntry& em = fExtentMap[idx];
3085 
3086         if (em.range.size != 0)
3087             printEM(em);
3088     }
3089 
3090     cout << endl;
3091 }
3092 
printFL() const3093 void ExtentMap::printFL() const
3094 {
3095 
3096     int flEntries = 0;
3097 
3098     if (fFLShminfo)
3099         flEntries = fFLShminfo->allocdSize / sizeof(InlineLBIDRange);
3100 
3101     cout << "Free List" << endl;
3102 
3103     for (int idx = 0; idx < flEntries; idx++)
3104     {
3105 
3106         cout << idx << " "
3107              << fFreeList[idx].start << " "
3108              << fFreeList[idx].size
3109              << endl;
3110     }
3111 
3112     cout << endl;
3113 }
3114 #endif
3115 
3116 //------------------------------------------------------------------------------
3117 // Rollback (delete) the extents that logically follow the specified extent for
3118 // the given OID and DBRoot.  HWM for the last extent is reset to the specified
3119 // value.
3120 // input:
3121 //   oid          - OID of the last logical extent to be retained
3122 //   bDeleteAll   - Flag indicates whether all extents for oid and dbroot are
3123 //                  to be deleted; else part#, seg#, and hwm are used.
3124 //   dbRoot       - DBRoot of the extents to be considered.
3125 //   partitionNum - partition number of the last logical extent to be retained
3126 //   segmentNum   - segment number of the last logical extent to be retained
3127 //   hwm          - HWM to be assigned to the last logical extent retained
3128 //------------------------------------------------------------------------------
rollbackColumnExtents_DBroot(int oid,bool bDeleteAll,uint16_t dbRoot,uint32_t partitionNum,uint16_t segmentNum,HWM_t hwm)3129 void ExtentMap::rollbackColumnExtents_DBroot ( int oid,
3130         bool      bDeleteAll,
3131         uint16_t dbRoot,
3132         uint32_t partitionNum,
3133         uint16_t segmentNum,
3134         HWM_t     hwm)
3135 {
3136     //bool oidExists = false;
3137 
3138 #ifdef BRM_INFO
3139     if (fDebug)
3140     {
3141         TRACER_WRITELATER("rollbackColumnExtents");
3142         TRACER_ADDINPUT(oid);
3143         TRACER_ADDBOOLINPUT(bDeleteAll);
3144         TRACER_ADDSHORTINPUT(dbRoot);
3145         TRACER_ADDINPUT(partitionNum);
3146         TRACER_ADDSHORTINPUT(segmentNum);
3147         TRACER_ADDINPUT(hwm);
3148         TRACER_WRITE;
3149     }
3150 
3151 #endif
3152 
3153 #ifdef BRM_DEBUG
3154 
3155     if (oid < 0)
3156     {
3157         log("ExtentMap::rollbackColumnExtents_DBroot(): OID must be >= 0",
3158             logging::LOG_TYPE_DEBUG);
3159         throw invalid_argument(
3160             "ExtentMap::rollbackColumnExtents_DBroot(): OID must be >= 0");
3161     }
3162 
3163 #endif
3164 
3165     uint32_t fboLo = 0;
3166     uint32_t fboHi = 0;
3167     uint32_t fboLoPreviousStripe = 0;
3168 
3169     grabEMEntryTable(WRITE);
3170     grabFreeList(WRITE);
3171 
3172     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
3173 
3174     for (int i = 0; i < emEntries; i++)
3175     {
3176         if ((fExtentMap[i].range.size  != 0) &&
3177                 (fExtentMap[i].fileID      == oid) &&
3178                 (fExtentMap[i].dbRoot      == dbRoot))
3179         {
3180 
3181             //oidExists = true;
3182 
3183             // Don't rollback extents that are out of service
3184             if (fExtentMap[i].status == EXTENTOUTOFSERVICE)
3185                 continue;
3186 
3187             // If bDeleteAll is true, then we delete extent w/o regards to
3188             // partition number, segment number, or HWM
3189             if (bDeleteAll)
3190             {
3191                 deleteExtent( i );                                     // case 0
3192                 continue;
3193             }
3194 
3195             // Calculate fbo range for the stripe containing the given hwm
3196             if (fboHi == 0)
3197             {
3198                 uint32_t range = fExtentMap[i].range.size * 1024;
3199                 fboLo = hwm - (hwm % range);
3200                 fboHi = fboLo + range - 1;
3201 
3202                 if (fboLo > 0)
3203                     fboLoPreviousStripe = fboLo - range;
3204             }
3205 
3206             // Delete, update, or ignore this extent:
3207             // Later partition:
3208             //   case 1: extent in later partition than last extent, so delete
3209             // Same partition:
3210             //   case 2: extent is in later stripe than last extent, so delete
3211             //   case 3: extent is in earlier stripe in the same partition.
3212             //           No action necessary for case3B and case3C.
3213             //     case 3A: extent is in trailing segment in previous stripe.
3214             //              This extent is now the last extent in that segment
3215             //              file, so reset the local HWM if it was altered.
3216             //     case 3B: extent in previous stripe but not a trailing segment
3217             //     case 3C: extent is in stripe that precedes previous stripe
3218             //   case 4: extent is in the same partition and stripe as the
3219             //           last logical extent we are to keep.
3220             //     case 4A: extent is in later segment so can be deleted
3221             //     case 4B: extent is in earlier segment, reset HWM if changed
3222             //     case 4C: this is last logical extent, reset HWM if changed
3223             // Earlier partition:
3224             //   case 5: extent is in earlier parition, no action necessary
3225 
3226             if (fExtentMap[i].partitionNum > partitionNum)
3227             {
3228                 deleteExtent( i );                                     // case 1
3229             }
3230             else if (fExtentMap[i].partitionNum == partitionNum)
3231             {
3232                 if (fExtentMap[i].blockOffset > fboHi)
3233                 {
3234                     deleteExtent( i );                                 // case 2
3235                 }
3236                 else if (fExtentMap[i].blockOffset < fboLo)
3237                 {
3238                     if (fExtentMap[i].blockOffset >= fboLoPreviousStripe)
3239                     {
3240                         if (fExtentMap[i].segmentNum > segmentNum)
3241                         {
3242                             if (fExtentMap[i].HWM != (fboLo - 1))
3243                             {
3244                                 makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
3245                                 fExtentMap[i].HWM    = fboLo - 1;      //case 3A
3246                                 fExtentMap[i].status = EXTENTAVAILABLE;
3247                             }
3248                         }
3249                         else
3250                         {
3251                             // not a trailing segment in prev stripe     case 3B
3252                         }
3253                     }
3254                     else
3255                     {
3256                         // extent precedes previous stripe               case 3C
3257                     }
3258                 }
3259                 else   // extent is in same stripe
3260                 {
3261                     if (fExtentMap[i].segmentNum > segmentNum)
3262                     {
3263                         deleteExtent( i );                            // case 4A
3264                     }
3265                     else if (fExtentMap[i].segmentNum < segmentNum)
3266                     {
3267                         if (fExtentMap[i].HWM != fboHi)
3268                         {
3269                             makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
3270                             fExtentMap[i].HWM    = fboHi;             // case 4B
3271                             fExtentMap[i].status = EXTENTAVAILABLE;
3272                         }
3273                     }
3274                     else   // fExtentMap[i].segmentNum == segmentNum
3275                     {
3276                         if (fExtentMap[i].HWM != hwm)
3277                         {
3278                             makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
3279                             fExtentMap[i].HWM    = hwm;               // case 4C
3280                             fExtentMap[i].status = EXTENTAVAILABLE;
3281                         }
3282                     }
3283                 }
3284             }
3285             else
3286             {
3287                 // extent in earlier partition; no action necessary       case 5
3288             }
3289         }  // extent map entry with matching oid
3290     }      // loop through the extent map
3291 
3292     // If this function is called, we are already in error recovery mode; so
3293     // don't worry about reporting an error if the OID is not found, because
3294     // we don't want/need the extents for that OID anyway.
3295     //if (!oidExists)
3296     //{
3297     //	ostringstream oss;
3298     //	oss << "ExtentMap::rollbackColumnExtents_DBroot(): "
3299     //		"Rollback failed: no extents exist for: OID-" << oid <<
3300     //		"; dbRoot-"    << dbRoot       <<
3301     //		"; partition-" << partitionNum <<
3302     //		"; segment-"   << segmentNum   <<
3303     //		"; hwm-"       << hwm;
3304     //	log(oss.str(), logging::LOG_TYPE_CRITICAL);
3305     //	throw invalid_argument(oss.str());
3306     //}
3307 }
3308 
3309 //------------------------------------------------------------------------------
3310 // Rollback (delete) the extents that follow the extents in partitionNum,
3311 // for the given dictionary OID & DBRoot.  The specified hwms represent the HWMs
// to be reset for each of the segment store files in this partition.  An HWM will
3313 // not be given for "every" segment file if we are rolling back to a point where
3314 // we had not yet created all the segment files in the partition.  In any case,
3315 // any extents for the "oid" that follow partitionNum, should be deleted.
3316 // Likewise, any extents in the same partition, whose segment file is not in
3317 // segNums[], should be deleted as well.  If hwms is empty, then this DBRoot
3318 // must have been empty at the start of the job, so all the extents for the
3319 // specified oid and dbRoot can be deleted.
3320 // input:
3321 //   oid          - OID of the "last" extents to be retained
3322 //   dbRoot       - DBRoot of the extents to be considered.
3323 //   partitionNum - partition number of the last extents to be retained
3324 //   segNums      - list of segment files with extents to be restored
3325 //   hwms         - HWMs to be assigned to the last retained extent in each of
3326 //                      the corresponding segment store files in segNums.
3327 //                  hwms[0] applies to segment store file segNums[0];
3328 //                  hwms[1] applies to segment store file segNums[1]; etc.
3329 //------------------------------------------------------------------------------
rollbackDictStoreExtents_DBroot(int oid,uint16_t dbRoot,uint32_t partitionNum,const vector<uint16_t> & segNums,const vector<HWM_t> & hwms)3330 void ExtentMap::rollbackDictStoreExtents_DBroot ( int oid,
3331         uint16_t            dbRoot,
3332         uint32_t            partitionNum,
3333         const vector<uint16_t>& segNums,
3334         const vector<HWM_t>& hwms)
3335 {
3336     //bool oidExists = false;
3337 
3338 #ifdef BRM_INFO
3339     if (fDebug)
3340     {
3341         ostringstream oss;
3342 
3343         for (unsigned int k = 0; k < hwms.size(); k++)
3344             oss << "; hwms[" << k << "]-"  << hwms[k];
3345 
3346         const string& hwmString(oss.str());
3347 
3348         // put TRACE inside separate scope {} to insure that temporary
3349         // hwmString still exists when tracer destructor tries to print it.
3350         {
3351             TRACER_WRITELATER("rollbackDictStoreExtents_DBroot");
3352             TRACER_ADDINPUT(oid);
3353             TRACER_ADDSHORTINPUT(dbRoot);
3354             TRACER_ADDINPUT(partitionNum);
3355             TRACER_ADDSTRINPUT(hwmString);
3356             TRACER_WRITE;
3357         }
3358     }
3359 
3360 #endif
3361 
3362     // Delete all extents for the specified OID and DBRoot,
3363     // if we are not given any hwms and segment files.
3364     bool bDeleteAll = false;
3365 
3366     if (hwms.size() == 0)
3367         bDeleteAll = true;
3368 
3369     // segToHwmMap maps segment file number to corresponding pair<hwm,fboLo>
3370     tr1::unordered_map<uint16_t, pair<uint32_t, uint32_t> > segToHwmMap;
3371     tr1::unordered_map<uint16_t, pair<uint32_t, uint32_t> >::const_iterator
3372     segToHwmMapIter;
3373 
3374     grabEMEntryTable(WRITE);
3375     grabFreeList(WRITE);
3376 
3377     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
3378 
3379     for (int i = 0; i < emEntries; i++)
3380     {
3381         if ((fExtentMap[i].range.size  != 0) &&
3382                 (fExtentMap[i].fileID      == oid) &&
3383                 (fExtentMap[i].dbRoot      == dbRoot))
3384         {
3385 
3386             //oidExists = true;
3387 
3388             // Don't rollback extents that are out of service
3389             if (fExtentMap[i].status == EXTENTOUTOFSERVICE)
3390                 continue;
3391 
3392             // If bDeleteAll is true, then we delete extent w/o regards to
3393             // partition number, segment number, or HWM
3394             if (bDeleteAll)
3395             {
3396                 deleteExtent( i );                                     // case 0
3397                 continue;
3398             }
3399 
3400             // Calculate fbo's for the list of hwms we are given; and store
3401             // the fbo and hwm in a map, using the segment file number as a key.
3402             if (segToHwmMap.size() == 0)
3403             {
3404                 uint32_t range = fExtentMap[i].range.size * 1024;
3405                 pair<uint32_t, uint32_t> segToHwmMapEntry;
3406 
3407                 for (unsigned int k = 0; k < hwms.size(); k++)
3408                 {
3409                     uint32_t fboLo = hwms[k] - (hwms[k] % range);
3410                     segToHwmMapEntry.first    = hwms[k];
3411                     segToHwmMapEntry.second   = fboLo;
3412                     segToHwmMap[ segNums[k] ] = segToHwmMapEntry;
3413                 }
3414             }
3415 
3416             // Delete, update, or ignore this extent:
3417             // Later partition:
3418             //   case 1: extent is in later partition, so delete the extent
3419             // Same partition:
3420             //   case 2: extent is in trailing seg file we don't need; so delete
3421             //   case 3: extent is in partition and segment file of interest
3422             //     case 3A: earlier extent in segment file; no action necessary
3423             //     case 3B: specified HWM falls in this extent, so reset HWM
3424             //     case 3C: later extent in segment file; so delete the extent
3425             // Earlier partition:
3426             //   case 4: extent is in earlier parition, no action necessary
3427 
3428             if (fExtentMap[i].partitionNum > partitionNum)
3429             {
3430                 deleteExtent( i );                                     // case 1
3431             }
3432             else if (fExtentMap[i].partitionNum == partitionNum)
3433             {
3434                 unsigned segNum = fExtentMap[i].segmentNum;
3435                 segToHwmMapIter = segToHwmMap.find( segNum );
3436 
3437                 if (segToHwmMapIter == segToHwmMap.end())
3438                 {
3439                     deleteExtent( i );                                 // case 2
3440                 }
3441                 else   // segment number in the map of files to keep
3442                 {
3443                     uint32_t fboLo = segToHwmMapIter->second.second;
3444 
3445                     if (fExtentMap[i].blockOffset < fboLo)
3446                     {
3447                         // no action necessary                           case 3A
3448                     }
3449                     else if (fExtentMap[i].blockOffset == fboLo)
3450                     {
3451                         uint32_t hwm = segToHwmMapIter->second.first;
3452 
3453                         if (fExtentMap[i].HWM != hwm)
3454                         {
3455                             makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
3456                             fExtentMap[i].HWM  = hwm;
3457                             fExtentMap[i].status = EXTENTAVAILABLE;   // case 3B
3458                         }
3459                     }
3460                     else
3461                     {
3462                         deleteExtent( i );                            // case 3C
3463                     }
3464                 }
3465             }
3466             else
3467             {
3468                 // extent in earlier partition; no action necessary       case 4
3469             }
3470         }  // extent map entry with matching oid
3471     }      // loop through the extent map
3472 
3473     // If this function is called, we are already in error recovery mode; so
3474     // don't worry about reporting an error if the OID is not found, because
3475     // we don't want/need the extents for that OID anyway.
3476     //if (!oidExists)
3477     //{
3478     //	ostringstream oss;
3479     //	oss << "ExtentMap::rollbackDictStoreExtents_DBroot(): "
3480     //		"Rollback failed: no extents exist for: OID-" << oid <<
3481     //		"; dbRoot-"    << dbRoot       <<
3482     //		"; partition-" << partitionNum;
3483     //	log(oss.str(), logging::LOG_TYPE_CRITICAL);
3484     //	throw invalid_argument(oss.str());
3485     //}
3486 }
3487 
3488 //------------------------------------------------------------------------------
3489 // Delete the extents specified and reset hwm
3490 //------------------------------------------------------------------------------
deleteEmptyColExtents(const ExtentsInfoMap_t & extentsInfo)3491 void ExtentMap::deleteEmptyColExtents(const ExtentsInfoMap_t& extentsInfo)
3492 {
3493 #ifdef BRM_INFO
3494 
3495     if (fDebug)
3496     {
3497         TRACER_WRITELATER("deleteEmptyColExtents");
3498         TRACER_WRITE;
3499     }
3500 
3501 #endif
3502 
3503     grabEMEntryTable(WRITE);
3504     grabFreeList(WRITE);
3505 
3506     uint32_t fboLo = 0;
3507     uint32_t fboHi = 0;
3508     uint32_t fboLoPreviousStripe = 0;
3509 
3510     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
3511     ExtentsInfoMap_t::const_iterator it;
3512 
3513     for (int i = 0; i < emEntries; i++)
3514     {
3515         if (fExtentMap[i].range.size  != 0)
3516         {
3517             it = extentsInfo.find ( fExtentMap[i].fileID );
3518 
3519             if ( it != extentsInfo.end() )
3520             {
3521                 // Don't rollback extents that are out of service
3522                 if (fExtentMap[i].status == EXTENTOUTOFSERVICE)
3523                     continue;
3524 
3525                 // Calculate fbo range for the stripe containing the given hwm
3526                 if (fboHi == 0)
3527                 {
3528                     uint32_t range = fExtentMap[i].range.size * 1024;
3529                     fboLo = it->second.hwm - (it->second.hwm % range);
3530                     fboHi = fboLo + range - 1;
3531 
3532                     if (fboLo > 0)
3533                         fboLoPreviousStripe = fboLo - range;
3534                 }
3535 
3536                 // Delete, update, or ignore this extent:
3537                 // Later partition:
3538                 //   case 1: extent in later partition than last extent, so delete
3539                 // Same partition:
3540                 //   case 2: extent is in later stripe than last extent, so delete
3541                 //   case 3: extent is in earlier stripe in the same partition.
3542                 //           No action necessary for case3B and case3C.
3543                 //     case 3A: extent is in trailing segment in previous stripe.
3544                 //              This extent is now the last extent in that segment
3545                 //              file, so reset the local HWM if it was altered.
3546                 //     case 3B: extent in previous stripe but not a trailing segment
3547                 //     case 3C: extent is in stripe that precedes previous stripe
3548                 //   case 4: extent is in the same partition and stripe as the
3549                 //           last logical extent we are to keep.
3550                 //     case 4A: extent is in later segment so can be deleted
3551                 //     case 4B: extent is in earlier segment, reset HWM if changed
3552                 //     case 4C: this is last logical extent, reset HWM if changed
3553                 // Earlier partition:
3554                 //   case 5: extent is in earlier parition, no action necessary
3555 
3556                 if (fExtentMap[i].partitionNum > it->second.partitionNum)
3557                 {
3558                     deleteExtent( i );                                 // case 1
3559                 }
3560                 else if (fExtentMap[i].partitionNum == it->second.partitionNum)
3561                 {
3562                     if (fExtentMap[i].blockOffset > fboHi)
3563                     {
3564                         deleteExtent( i );                             // case 2
3565                     }
3566                     else if (fExtentMap[i].blockOffset < fboLo)
3567                     {
3568                         if (fExtentMap[i].blockOffset >= fboLoPreviousStripe)
3569                         {
3570                             if (fExtentMap[i].segmentNum > it->second.segmentNum)
3571                             {
3572                                 if (fExtentMap[i].HWM != (fboLo - 1))
3573                                 {
3574                                     makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
3575                                     fExtentMap[i].HWM    = fboLo - 1;  //case 3A
3576                                     fExtentMap[i].status = EXTENTAVAILABLE;
3577                                 }
3578                             }
3579                             else
3580                             {
3581                                 // not a trailing segment in prev stripe     case 3B
3582                             }
3583                         }
3584                         else
3585                         {
3586                             // extent precedes previous stripe           case 3C
3587                         }
3588                     }
3589                     else
3590                     {
3591                         // extent is in same stripe
3592                         if (fExtentMap[i].segmentNum > it->second.segmentNum)
3593                         {
3594                             deleteExtent( i );                        // case 4A
3595                         }
3596                         else if (fExtentMap[i].segmentNum < it->second.segmentNum)
3597                         {
3598                             if (fExtentMap[i].HWM != fboHi)
3599                             {
3600                                 makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
3601                                 fExtentMap[i].HWM    = fboHi;         // case 4B
3602                                 fExtentMap[i].status = EXTENTAVAILABLE;
3603                             }
3604                         }
3605                         else
3606                         {
3607                             // fExtentMap[i].segmentNum == segmentNum
3608                             if (fExtentMap[i].HWM != it->second.hwm)
3609                             {
3610                                 makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
3611                                 fExtentMap[i].HWM    = it->second.hwm;// case 4C
3612                                 fExtentMap[i].status = EXTENTAVAILABLE;
3613                             }
3614                         }
3615                     }
3616                 }
3617                 else
3618                 {
3619                     // extent in earlier partition; no action necessary   case 5
3620                 }
3621             }  // extent map entry with matching oid
3622         }
3623     }      // loop through the extent map
3624 }
3625 
deleteEmptyDictStoreExtents(const ExtentsInfoMap_t & extentsInfo)3626 void ExtentMap::deleteEmptyDictStoreExtents(const ExtentsInfoMap_t& extentsInfo)
3627 {
3628 #ifdef BRM_INFO
3629 
3630     if (fDebug)
3631     {
3632         TRACER_WRITELATER("deleteEmptyDictStoreExtents");
3633         TRACER_WRITE;
3634     }
3635 
3636 #endif
3637 
3638     grabEMEntryTable(WRITE);
3639     grabFreeList(WRITE);
3640 
3641     ExtentsInfoMap_t::const_iterator it;
3642 
3643     uint32_t fboLo = 0;
3644     uint32_t fboHi = 0;
3645 
3646     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
3647     it  = extentsInfo.begin();
3648 
3649     if ( it->second.newFile ) //The extent is the new extent
3650     {
3651         for (int i = 0; i < emEntries; i++)
3652         {
3653             if (fExtentMap[i].range.size  != 0)
3654             {
3655                 it = extentsInfo.find ( fExtentMap[i].fileID );
3656 
3657                 if ( it != extentsInfo.end() )
3658                 {
3659                     if ((fExtentMap[i].partitionNum == it->second.partitionNum)
3660                             && (fExtentMap[i].segmentNum == it->second.segmentNum)
3661                             && (fExtentMap[i].dbRoot == it->second.dbRoot) )
3662                         deleteExtent( i );
3663                 }
3664             }
3665         }
3666     }
3667     else //The extent is the old one
3668     {
3669 
3670         for (int i = 0; i < emEntries; i++)
3671         {
3672             if (fExtentMap[i].range.size  != 0)
3673             {
3674                 it = extentsInfo.find ( fExtentMap[i].fileID );
3675 
3676                 if ( it != extentsInfo.end() )
3677                 {
3678                     // Don't rollback extents that are out of service
3679                     if (fExtentMap[i].status == EXTENTOUTOFSERVICE)
3680                         continue;
3681 
3682                     // Calculate fbo
3683                     if (fboHi == 0)
3684                     {
3685                         uint32_t range = fExtentMap[i].range.size * 1024;
3686                         fboLo = it->second.hwm - (it->second.hwm % range);
3687                         fboHi = fboLo + range - 1;
3688                     }
3689 
3690                     // Delete, update, or ignore this extent:
3691                     // Later partition:
3692                     //   case 1: extent is in later partition, so delete the extent
3693                     // Same partition:
3694                     //   case 2: extent is in partition and segment file of interest
3695                     //     case 2A: earlier extent in segment file; no action necessary
3696                     //     case 2B: specified HWM falls in this extent, so reset HWM
3697                     //     case 2C: later extent in segment file; so delete the extent
3698                     // Earlier partition:
3699                     //   case 3: extent is in earlier parition, no action necessary
3700 
3701                     if (fExtentMap[i].partitionNum > it->second.partitionNum)
3702                     {
3703                         deleteExtent( i );                                 // case 1
3704                     }
3705                     else if (fExtentMap[i].partitionNum == it->second.partitionNum)
3706                     {
3707                         if ( fExtentMap[i].segmentNum == it->second.segmentNum)
3708                         {
3709                             if (fExtentMap[i].blockOffset < fboLo)
3710                             {
3711                                 // no action necessary                           case 2A
3712                             }
3713                             else if (fExtentMap[i].blockOffset == fboLo)
3714                             {
3715                                 if (fExtentMap[i].HWM != it->second.hwm)
3716                                 {
3717                                     makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
3718                                     fExtentMap[i].HWM  = it->second.hwm;
3719                                     fExtentMap[i].status = EXTENTAVAILABLE;//case 2B
3720                                 }
3721                             }
3722                             else
3723                             {
3724                                 deleteExtent( i );                        // case 3C
3725                             }
3726                         }
3727                         else
3728                         {
3729                             // no action necessary
3730                         }
3731                     }
3732                     else
3733                     {
3734                         // extent in earlier partition; no action necessary       case 4
3735                     }
3736                 }  // extent map entry with matching oid
3737             }
3738         }      // loop through the extent map
3739     }
3740 }
3741 //------------------------------------------------------------------------------
3742 // Delete all the extents for the specified OID
3743 //------------------------------------------------------------------------------
deleteOID(int OID)3744 void ExtentMap::deleteOID(int OID)
3745 {
3746 #ifdef BRM_INFO
3747 
3748     if (fDebug)
3749     {
3750         TRACER_WRITELATER("deleteOID");
3751         TRACER_ADDINPUT(OID);
3752         TRACER_WRITE;
3753     }
3754 
3755 #endif
3756 
3757     bool OIDExists = false;
3758 
3759 #ifdef BRM_DEBUG
3760 
3761     if (OID < 0)
3762     {
3763         log("ExtentMap::deleteOID(): OID must be >= 0", logging::LOG_TYPE_DEBUG);
3764         throw invalid_argument("ExtentMap::deleteOID(): OID must be >= 0");
3765     }
3766 
3767 #endif
3768 
3769     grabEMEntryTable(WRITE);
3770     grabFreeList(WRITE);
3771 
3772     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
3773 
3774     for (int emIndex = 0; emIndex < emEntries; emIndex++)
3775     {
3776 
3777         if (fExtentMap[emIndex].range.size > 0 &&
3778                 fExtentMap[emIndex].fileID == OID)
3779         {
3780             OIDExists = true;
3781 
3782             deleteExtent( emIndex );
3783         }
3784     }
3785 
3786     if (!OIDExists)
3787     {
3788         ostringstream oss;
3789         oss << "ExtentMap::deleteOID(): There are no extent entries for OID " << OID << endl;
3790         log(oss.str(), logging::LOG_TYPE_CRITICAL);
3791         throw invalid_argument(oss.str());
3792     }
3793 }
3794 
3795 
3796 
3797 //------------------------------------------------------------------------------
3798 // Delete all the extents for the specified OIDs
3799 //------------------------------------------------------------------------------
deleteOIDs(const OidsMap_t & OIDs)3800 void ExtentMap::deleteOIDs(const OidsMap_t& OIDs)
3801 {
3802 #ifdef BRM_INFO
3803 
3804     if (fDebug)
3805     {
3806         TRACER_WRITELATER("deleteOIDs");
3807         TRACER_WRITE;
3808     }
3809 
3810 #endif
3811     grabEMEntryTable(WRITE);
3812     grabFreeList(WRITE);
3813     OidsMap_t::const_iterator it;
3814     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
3815 
3816     for (int emIndex = 0; emIndex < emEntries; emIndex++)
3817     {
3818         if (fExtentMap[emIndex].range.size > 0 )
3819         {
3820             it = OIDs.find ( fExtentMap[emIndex].fileID );
3821 
3822             if ( it != OIDs.end() )
3823                 deleteExtent( emIndex );
3824         }
3825     }
3826 }
3827 
3828 
3829 //------------------------------------------------------------------------------
3830 // Delete the specified extent from the extentmap and return to the free list.
3831 // emIndex - the index (from the extent map) of the extent to be deleted
3832 //------------------------------------------------------------------------------
deleteExtent(int emIndex)3833 void ExtentMap::deleteExtent(int emIndex)
3834 {
3835     int flIndex, freeFLIndex, flEntries, preceedingExtent, succeedingExtent;
3836     LBID_t flBlockEnd, emBlockEnd;
3837 
3838     flEntries = fFLShminfo->allocdSize / sizeof(InlineLBIDRange);
3839 
3840     emBlockEnd = fExtentMap[emIndex].range.start +
3841                  (static_cast<LBID_t>(fExtentMap[emIndex].range.size) * 1024);
3842 
3843     //scan the freelist to see where this entry fits in
3844     for (flIndex = 0, preceedingExtent = -1, succeedingExtent = -1, freeFLIndex = -1;
3845             flIndex < flEntries; flIndex++)
3846     {
3847         if (fFreeList[flIndex].size == 0)
3848             freeFLIndex = flIndex;
3849         else
3850         {
3851             flBlockEnd = fFreeList[flIndex].start +
3852                          (static_cast<LBID_t>(fFreeList[flIndex].size) * 1024);
3853 
3854             if (emBlockEnd == fFreeList[flIndex].start)
3855                 succeedingExtent = flIndex;
3856             else if (flBlockEnd == fExtentMap[emIndex].range.start)
3857                 preceedingExtent = flIndex;
3858         }
3859     }
3860 
3861     //update the freelist
3862 
3863     //this space is in between 2 blocks in the FL
3864     if (preceedingExtent != -1 && succeedingExtent != -1)
3865     {
3866         makeUndoRecord(&fFreeList[preceedingExtent], sizeof(InlineLBIDRange));
3867 
3868         // migrate the entry upward if there's a space
3869         if (freeFLIndex < preceedingExtent && freeFLIndex != -1)
3870         {
3871             makeUndoRecord(&fFreeList[freeFLIndex], sizeof(InlineLBIDRange));
3872             memcpy(&fFreeList[freeFLIndex], &fFreeList[preceedingExtent], sizeof(InlineLBIDRange));
3873             fFreeList[preceedingExtent].size = 0;
3874             preceedingExtent = freeFLIndex;
3875         }
3876 
3877         fFreeList[preceedingExtent].size += fFreeList[succeedingExtent].size +
3878                                             fExtentMap[emIndex].range.size;
3879         makeUndoRecord(&fFreeList[succeedingExtent], sizeof(InlineLBIDRange));
3880         fFreeList[succeedingExtent].size = 0;
3881         makeUndoRecord(fFLShminfo, sizeof(MSTEntry));
3882         fFLShminfo->currentSize -= sizeof(InlineLBIDRange);
3883     }
3884 
3885     //this space has a free block at the end
3886     else if (succeedingExtent != -1)
3887     {
3888         makeUndoRecord(&fFreeList[succeedingExtent], sizeof(InlineLBIDRange));
3889 
3890         // migrate the entry upward if there's a space
3891         if (freeFLIndex < succeedingExtent && freeFLIndex != -1)
3892         {
3893             makeUndoRecord(&fFreeList[freeFLIndex], sizeof(InlineLBIDRange));
3894             memcpy(&fFreeList[freeFLIndex], &fFreeList[succeedingExtent], sizeof(InlineLBIDRange));
3895             fFreeList[succeedingExtent].size = 0;
3896             succeedingExtent = freeFLIndex;
3897         }
3898 
3899         fFreeList[succeedingExtent].start = fExtentMap[emIndex].range.start;
3900         fFreeList[succeedingExtent].size += fExtentMap[emIndex].range.size;
3901     }
3902 
3903     //this space has a free block at the beginning
3904     else if (preceedingExtent != -1)
3905     {
3906         makeUndoRecord(&fFreeList[preceedingExtent], sizeof(InlineLBIDRange));
3907 
3908         // migrate the entry upward if there's a space
3909         if (freeFLIndex < preceedingExtent && freeFLIndex != -1)
3910         {
3911             makeUndoRecord(&fFreeList[freeFLIndex], sizeof(InlineLBIDRange));
3912             memcpy(&fFreeList[freeFLIndex], &fFreeList[preceedingExtent], sizeof(InlineLBIDRange));
3913             fFreeList[preceedingExtent].size = 0;
3914             preceedingExtent = freeFLIndex;
3915         }
3916 
3917         fFreeList[preceedingExtent].size += fExtentMap[emIndex].range.size;
3918     }
3919 
3920     //the freelist has no adjacent blocks, so make a new entry
3921     else
3922     {
3923         if (fFLShminfo->currentSize == fFLShminfo->allocdSize)
3924         {
3925             growFLShmseg();
3926 #ifdef BRM_DEBUG
3927 
3928             if (freeFLIndex != -1)
3929             {
3930                 log("ExtentMap::deleteOID(): found a free FL entry in a supposedly full shmseg", logging::LOG_TYPE_DEBUG);
3931                 throw logic_error("ExtentMap::deleteOID(): found a free FL entry in a supposedly full shmseg");
3932             }
3933 
3934 #endif
3935             freeFLIndex = flEntries;  // happens to be the right index
3936             flEntries = fFLShminfo->allocdSize / sizeof(InlineLBIDRange);
3937         }
3938 
3939 #ifdef BRM_DEBUG
3940 
3941         if (freeFLIndex == -1)
3942         {
3943             log("ExtentMap::deleteOID(): no available free list entries?", logging::LOG_TYPE_DEBUG);
3944             throw logic_error("ExtentMap::deleteOID(): no available free list entries?");
3945         }
3946 
3947 #endif
3948         makeUndoRecord(&fFreeList[freeFLIndex], sizeof(InlineLBIDRange));
3949         fFreeList[freeFLIndex].start = fExtentMap[emIndex].range.start;
3950         fFreeList[freeFLIndex].size = fExtentMap[emIndex].range.size;
3951         makeUndoRecord(&fFLShminfo, sizeof(MSTEntry));
3952         fFLShminfo->currentSize += sizeof(InlineLBIDRange);
3953     }
3954 
3955     //invalidate the entry in the Extent Map
3956     makeUndoRecord(&fExtentMap[emIndex], sizeof(EMEntry));
3957     fExtentMap[emIndex].range.size = 0;
3958     makeUndoRecord(&fEMShminfo, sizeof(MSTEntry));
3959     fEMShminfo->currentSize -= sizeof(struct EMEntry);
3960 }
3961 
3962 //------------------------------------------------------------------------------
3963 // Returns the last local HWM for the specified OID for the given DBroot.
3964 // Also returns the DBRoot, and partition, and segment numbers for the relevant
3965 // segment file. Technically, this function finds the "last" extent for the
3966 // specified OID, and returns the HWM for that extent.  It is assumed that the
3967 // HWM for the segment file containing this "last" extent, has been stored in
3968 // that extent's hwm; and that the hwm is not still hanging around in a previous
3969 // extent for the same segment file.
3970 // If no available or outOfService extent is found, then bFound is returned
3971 // as false.
3972 //------------------------------------------------------------------------------
getLastHWM_DBroot(int OID,uint16_t dbRoot,uint32_t & partitionNum,uint16_t & segmentNum,int & status,bool & bFound)3973 HWM_t ExtentMap::getLastHWM_DBroot(int OID, uint16_t dbRoot,
3974                                    uint32_t& partitionNum, uint16_t& segmentNum, int& status, bool& bFound)
3975 {
3976 #ifdef BRM_INFO
3977 
3978     if (fDebug)
3979     {
3980         TRACER_WRITELATER("getLastHWM_DBroot");
3981         TRACER_ADDINPUT(OID);
3982         TRACER_ADDSHORTINPUT(dbRoot);
3983         TRACER_ADDOUTPUT(partitionNum);
3984         TRACER_ADDSHORTOUTPUT(segmentNum);
3985         TRACER_ADDOUTPUT(status);
3986         TRACER_WRITE;
3987     }
3988 
3989 #endif
3990 
3991     uint32_t lastExtent = 0;
3992     int  lastExtentIndex = -1;
3993     partitionNum = 0;
3994     segmentNum   = 0;
3995     HWM_t hwm    = 0;
3996     bFound       = false;
3997 
3998     if (OID < 0)
3999     {
4000         ostringstream oss;
4001         oss << "ExtentMap::getLastHWM_DBroot(): invalid OID requested: " << OID;
4002         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4003         throw invalid_argument(oss.str());
4004     }
4005 
4006     grabEMEntryTable(READ);
4007 
4008     // Searching the array in reverse order should be faster since the last
4009     // extent is usually at the bottom.  We still have to search the entire
4010     // array (just in case), but the number of operations per loop iteration
4011     // will be less.
4012     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4013 
4014     for (int i = emEntries - 1; i >= 0; i--)
4015     {
4016         if ((fExtentMap[i].range.size != 0)   &&
4017                 (fExtentMap[i].fileID     == OID) &&
4018                 (fExtentMap[i].dbRoot     == dbRoot) &&
4019                 ((fExtentMap[i].status    == EXTENTAVAILABLE) ||
4020                  (fExtentMap[i].status    == EXTENTOUTOFSERVICE)))
4021         {
4022             if ( (fExtentMap[i].partitionNum >  partitionNum) ||
4023                     ((fExtentMap[i].partitionNum == partitionNum) &&
4024                      (fExtentMap[i].blockOffset  >  lastExtent))  ||
4025                     ((fExtentMap[i].partitionNum == partitionNum) &&
4026                      (fExtentMap[i].blockOffset  == lastExtent) &&
4027                      (fExtentMap[i].segmentNum   >= segmentNum)) )
4028             {
4029                 lastExtent      = fExtentMap[i].blockOffset;
4030                 partitionNum    = fExtentMap[i].partitionNum;
4031                 segmentNum      = fExtentMap[i].segmentNum;
4032                 lastExtentIndex = i;
4033             }
4034         }
4035     }
4036 
4037     // save additional information before we release the read-lock
4038     if (lastExtentIndex != -1)
4039     {
4040         hwm    = fExtentMap[lastExtentIndex].HWM;
4041         status = fExtentMap[lastExtentIndex].status;
4042         bFound = true;
4043     }
4044 
4045     releaseEMEntryTable(READ);
4046 
4047     return hwm;
4048 }
4049 
4050 //------------------------------------------------------------------------------
4051 // For the specified OID and PM number, this function will return a vector
4052 // of objects carrying HWM info (for the last segment file) and block count
4053 // information about each DBRoot assigned to the specified PM.
4054 //------------------------------------------------------------------------------
getDbRootHWMInfo(int OID,uint16_t pmNumber,EmDbRootHWMInfo_v & emDbRootHwmInfos)4055 void ExtentMap::getDbRootHWMInfo(int OID, uint16_t pmNumber,
4056                                  EmDbRootHWMInfo_v& emDbRootHwmInfos)
4057 {
4058 #ifdef BRM_INFO
4059 
4060     if (fDebug)
4061     {
4062         TRACER_WRITELATER("getDbRootHWMInfo");
4063         TRACER_ADDINPUT(OID);
4064         TRACER_ADDSHORTINPUT(pmNumber);
4065         TRACER_WRITE;
4066     }
4067 
4068 #endif
4069 
4070     if (OID < 0)
4071     {
4072         ostringstream oss;
4073         oss << "ExtentMap::getDbRootHWMInfo(): invalid OID requested: " << OID;
4074         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4075         throw invalid_argument(oss.str());
4076     }
4077 
4078     // Determine List of DBRoots for specified PM, and construct map of
4079     // EmDbRootHWMInfo objects.
4080     tr1::unordered_map<uint16_t, EmDbRootHWMInfo> emDbRootMap;
4081     vector<int> dbRootList;
4082     getPmDbRoots( pmNumber, dbRootList );
4083 
4084     if ( dbRootList.size() > 0 )
4085     {
4086         for (unsigned int iroot = 0; iroot < dbRootList.size(); iroot++)
4087         {
4088             uint16_t rootID = dbRootList[iroot];
4089             EmDbRootHWMInfo emDbRootInfo(rootID);
4090             emDbRootMap[rootID] = emDbRootInfo;
4091         }
4092     }
4093     else
4094     {
4095         ostringstream oss;
4096         oss << "ExtentMap::getDbRootHWMInfo(): "
4097             "There are no DBRoots for OID " << OID <<
4098             " and PM " << pmNumber << endl;
4099         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4100         throw invalid_argument(oss.str());
4101     }
4102 
4103     grabEMEntryTable(READ);
4104     tr1::unordered_map<uint16_t, EmDbRootHWMInfo>::iterator emIter;
4105 
4106     // Searching the array in reverse order should be faster since the last
4107     // extent is usually at the bottom.  We still have to search the entire
4108     // array (just in case), but the number of operations per loop iteration
4109     // will be less.
4110     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4111 
4112     for (int i = emEntries - 1; i >= 0; i--)
4113     {
4114         if ((fExtentMap[i].range.size != 0)   &&
4115                 (fExtentMap[i].fileID     == OID))
4116         {
4117 
4118             // Include this extent in the search, only if the extent's
4119             // DBRoot falls in the list of DBRoots for this PM.
4120             emIter = emDbRootMap.find( fExtentMap[i].dbRoot );
4121 
4122             if (emIter == emDbRootMap.end())
4123                 continue;
4124 
4125             EmDbRootHWMInfo& emDbRoot = emIter->second;
4126 
4127             if ((fExtentMap[i].status != EXTENTOUTOFSERVICE) &&
4128                     (fExtentMap[i].HWM != 0))
4129                 emDbRoot.totalBlocks += (fExtentMap[i].HWM + 1);
4130 
4131             if ( (fExtentMap[i].partitionNum >  emDbRoot.partitionNum) ||
4132                     ((fExtentMap[i].partitionNum == emDbRoot.partitionNum) &&
4133                      (fExtentMap[i].blockOffset   >  emDbRoot.fbo))         ||
4134                     ((fExtentMap[i].partitionNum == emDbRoot.partitionNum) &&
4135                      (fExtentMap[i].blockOffset   == emDbRoot.fbo) &&
4136                      (fExtentMap[i].segmentNum    >= emDbRoot.segmentNum)) )
4137             {
4138                 emDbRoot.fbo              = fExtentMap[i].blockOffset;
4139                 emDbRoot.partitionNum     = fExtentMap[i].partitionNum;
4140                 emDbRoot.segmentNum       = fExtentMap[i].segmentNum;
4141                 emDbRoot.localHWM         = fExtentMap[i].HWM;
4142                 emDbRoot.startLbid        = fExtentMap[i].range.start;
4143                 emDbRoot.status           = fExtentMap[i].status;
4144                 emDbRoot.hwmExtentIndex   = i;
4145             }
4146         }
4147     }
4148 
4149     releaseEMEntryTable(READ);
4150 
4151     for (tr1::unordered_map<uint16_t, EmDbRootHWMInfo>::iterator iter =
4152                 emDbRootMap.begin(); iter != emDbRootMap.end(); ++iter)
4153     {
4154         EmDbRootHWMInfo& emDbRoot = iter->second;
4155 
4156         if (emDbRoot.hwmExtentIndex != -1)
4157         {
4158             // @bug 5349: make sure HWM extent for each DBRoot is AVAILABLE
4159             if (emDbRoot.status == EXTENTUNAVAILABLE)
4160             {
4161                 ostringstream oss;
4162                 oss << "ExtentMap::getDbRootHWMInfo(): " <<
4163                     "OID " << OID <<
4164                     " has HWM extent that is UNAVAILABLE for " <<
4165                     "DBRoot"      << emDbRoot.dbRoot       <<
4166                     "; part#: "   << emDbRoot.partitionNum <<
4167                     ", seg#: "    << emDbRoot.segmentNum   <<
4168                     ", fbo: "     << emDbRoot.fbo          <<
4169                     ", localHWM: " << emDbRoot.localHWM     <<
4170                     ", lbid: "    << emDbRoot.startLbid    << endl;
4171                 log(oss.str(), logging::LOG_TYPE_CRITICAL);
4172                 throw runtime_error(oss.str());
4173             }
4174 
4175             // In the loop above we ignored "all" the extents with HWM of 0,
4176             // which is okay most of the time, because each segment file's HWM
4177             // is carried in the last extent only.  BUT if we have a segment
4178             // file with HWM=0, having a single extent and a single block at
4179             // the "end" of the data, we still need to account for this last
4180             // block.  So we increment the block count for this isolated case.
4181             if ((emDbRoot.localHWM == 0) &&
4182                     (emDbRoot.status == EXTENTAVAILABLE))
4183             {
4184                 emDbRoot.totalBlocks++;
4185             }
4186         }
4187     }
4188 
4189     // Copy internal map to the output vector argument
4190     for (tr1::unordered_map<uint16_t, EmDbRootHWMInfo>::iterator iter =
4191                 emDbRootMap.begin(); iter != emDbRootMap.end(); ++iter)
4192     {
4193         emDbRootHwmInfos.push_back( iter->second );
4194     }
4195 }
4196 
4197 //------------------------------------------------------------------------------
4198 // Return the existence (bFound) and state (status) for the segment file
4199 // containing the extents for the specified OID, partition, and segment.
4200 // If no extents are found, no exception is thrown.  We instead just return
4201 // bFound=false, so that the application can take the necessary action.
4202 // The value returned in the "status" variable is based on the first extent
4203 // found, since all the extents in a segment file should have the same state.
4204 //------------------------------------------------------------------------------
getExtentState(int OID,uint32_t partitionNum,uint16_t segmentNum,bool & bFound,int & status)4205 void ExtentMap::getExtentState(int OID, uint32_t partitionNum,
4206                                uint16_t segmentNum, bool& bFound, int& status)
4207 {
4208 #ifdef BRM_INFO
4209 
4210     if (fDebug)
4211     {
4212         TRACER_WRITELATER("getExtentState");
4213         TRACER_ADDINPUT(OID);
4214         TRACER_ADDINPUT(partitionNum);
4215         TRACER_ADDSHORTINPUT(segmentNum);
4216         TRACER_ADDOUTPUT(status);
4217         TRACER_WRITE;
4218     }
4219 
4220 #endif
4221     int i, emEntries;
4222     bFound = false;
4223     status = EXTENTAVAILABLE;
4224 
4225     if (OID < 0)
4226     {
4227         ostringstream oss;
4228         oss << "ExtentMap::getExtentState(): invalid OID requested: " << OID;
4229         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4230         throw invalid_argument(oss.str());
4231     }
4232 
4233     grabEMEntryTable(READ);
4234 
4235     emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4236 
4237     for (i = 0; i < emEntries; i++)
4238     {
4239         if ((fExtentMap[i].range.size  != 0) &&
4240                 (fExtentMap[i].fileID      == OID) &&
4241                 (fExtentMap[i].partitionNum == partitionNum) &&
4242                 (fExtentMap[i].segmentNum  == segmentNum))
4243         {
4244             bFound = true;
4245             status = fExtentMap[i].status;
4246             break;
4247         }
4248     }
4249 
4250     releaseEMEntryTable(READ);
4251 }
4252 
4253 //------------------------------------------------------------------------------
4254 // Returns the HWM for the specified OID, partition, and segment numbers.
4255 // Used to get the HWM for a specific column or dictionary store segment file.
4256 //------------------------------------------------------------------------------
getLocalHWM(int OID,uint32_t partitionNum,uint16_t segmentNum,int & status)4257 HWM_t ExtentMap::getLocalHWM(int OID, uint32_t partitionNum,
4258                              uint16_t segmentNum, int& status)
4259 {
4260 #ifdef BRM_INFO
4261 
4262     if (fDebug)
4263     {
4264         TRACER_WRITELATER("getLocalHWM");
4265         TRACER_ADDINPUT(OID);
4266         TRACER_ADDINPUT(partitionNum);
4267         TRACER_ADDSHORTINPUT(segmentNum);
4268         TRACER_ADDOUTPUT(status);
4269         TRACER_WRITE;
4270     }
4271 
4272 #endif
4273 
4274 #ifdef EM_AS_A_TABLE_POC__
4275 
4276     if (OID == 1084)
4277     {
4278         return 0;
4279     }
4280 
4281 #endif
4282 
4283     int i, emEntries;
4284     HWM_t ret = 0;
4285     bool OIDPartSegExists = false;
4286 
4287     if (OID < 0)
4288     {
4289         ostringstream oss;
4290         oss << "ExtentMap::getLocalHWM(): invalid OID requested: " << OID;
4291         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4292         throw invalid_argument(oss.str());
4293     }
4294 
4295     grabEMEntryTable(READ);
4296 
4297     emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4298 
4299     for (i = 0; i < emEntries; i++)
4300     {
4301         if ((fExtentMap[i].range.size  != 0) &&
4302                 (fExtentMap[i].fileID      == OID) &&
4303                 (fExtentMap[i].partitionNum == partitionNum) &&
4304                 (fExtentMap[i].segmentNum  == segmentNum))
4305         {
4306             OIDPartSegExists = true;
4307             status = fExtentMap[i].status;
4308 
4309             if (fExtentMap[i].HWM != 0)
4310             {
4311                 ret = fExtentMap[i].HWM;
4312                 releaseEMEntryTable(READ);
4313                 return ret;
4314             }
4315         }
4316     }
4317 
4318     releaseEMEntryTable(READ);
4319 
4320     if (OIDPartSegExists)
4321         return 0;
4322     else
4323     {
4324         ostringstream oss;
4325         oss << "ExtentMap::getLocalHWM(): There are no extent entries for OID " <<
4326             OID << "; partition " << partitionNum << "; segment " <<
4327             segmentNum << endl;
4328         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4329         throw invalid_argument(oss.str());
4330     }
4331 }
4332 
4333 //------------------------------------------------------------------------------
4334 // Sets the HWM for the specified OID, partition, and segment number.
4335 // In addition, the HWM for the old HWM extent (for this segment file),
4336 // is set to 0, so that the latest HWM is only carried in the last extent
4337 // (per segment file).
4338 // Used for dictionary or column OIDs to set the HWM for specific segment file.
4339 //------------------------------------------------------------------------------
setLocalHWM(int OID,uint32_t partitionNum,uint16_t segmentNum,HWM_t newHWM,bool firstNode,bool uselock)4340 void ExtentMap::setLocalHWM(int OID, uint32_t partitionNum,
4341                             uint16_t segmentNum, HWM_t newHWM, bool firstNode, bool uselock)
4342 {
4343 #ifdef BRM_INFO
4344 
4345     if (fDebug)
4346     {
4347         TRACER_WRITELATER("setLocalHWM");
4348         TRACER_ADDINPUT(OID);
4349         TRACER_ADDINPUT(partitionNum);
4350         TRACER_ADDSHORTINPUT(segmentNum);
4351         TRACER_ADDINPUT(newHWM);
4352         TRACER_WRITE;
4353     }
4354 
4355     bool addedAnExtent = false;
4356 
4357     if (OID < 0)
4358     {
4359         log("ExtentMap::setLocalHWM(): OID must be >= 0",
4360             logging::LOG_TYPE_DEBUG);
4361         throw invalid_argument(
4362             "ExtentMap::setLocalHWM(): OID must be >= 0");
4363     }
4364 
4365 #endif
4366 
4367     int lastExtentIndex     = -1;
4368     int oldHWMExtentIndex   = -1;
4369     uint32_t highestOffset = 0;
4370 
4371     if (uselock)
4372         grabEMEntryTable(WRITE);
4373 
4374     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4375 
4376     for (int i = 0; i < emEntries; i++)
4377     {
4378         if ((fExtentMap[i].range.size  != 0) &&
4379                 (fExtentMap[i].fileID      == OID) &&
4380                 (fExtentMap[i].partitionNum == partitionNum) &&
4381                 (fExtentMap[i].segmentNum  == segmentNum))
4382         {
4383 
4384             // Find current HWM extent
4385             if (fExtentMap[i].blockOffset >= highestOffset)
4386             {
4387                 highestOffset   = fExtentMap[i].blockOffset;
4388                 lastExtentIndex = i;
4389             }
4390 
4391             // Find previous HWM extent
4392             if (fExtentMap[i].HWM != 0)
4393             {
4394                 oldHWMExtentIndex = i;
4395             }
4396         }
4397     }
4398 
4399     if (lastExtentIndex == -1)
4400     {
4401         ostringstream oss;
4402         oss << "ExtentMap::setLocalHWM(): Bad OID/partition/segment argument; "
4403             "no extent entries for OID " << OID << "; partition " <<
4404             partitionNum << "; segment " << segmentNum << endl;
4405         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4406         throw invalid_argument(oss.str());
4407     }
4408 
4409     if (newHWM >= (fExtentMap[lastExtentIndex].blockOffset +
4410                    fExtentMap[lastExtentIndex].range.size * 1024))
4411     {
4412         ostringstream oss;
4413         oss << "ExtentMap::setLocalHWM(): "
4414             "new HWM is past the end of the file for OID " << OID << "; partition " <<
4415             partitionNum << "; segment " << segmentNum << "; HWM " << newHWM;
4416         log(oss.str(), logging::LOG_TYPE_DEBUG);
4417         throw invalid_argument(oss.str());
4418     }
4419 
4420     // Save HWM in last extent for this segment file; and mark as AVAILABLE
4421     makeUndoRecord(&fExtentMap[lastExtentIndex], sizeof(EMEntry));
4422     fExtentMap[lastExtentIndex].HWM    = newHWM;
4423     fExtentMap[lastExtentIndex].status = EXTENTAVAILABLE;
4424 
4425     // Reset HWM in old HWM extent to 0
4426     if ((oldHWMExtentIndex != -1) && (oldHWMExtentIndex != lastExtentIndex))
4427     {
4428         makeUndoRecord(&fExtentMap[oldHWMExtentIndex], sizeof(EMEntry));
4429         fExtentMap[oldHWMExtentIndex].HWM = 0;
4430 #ifdef BRM_INFO
4431         addedAnExtent = true;
4432 #endif
4433     }
4434 
4435 #ifdef BRM_INFO
4436 
4437     if (firstNode)
4438     {
4439         ostringstream os;
4440         os << "ExtentMap::setLocalHWM(): firstLBID=" << fExtentMap[lastExtentIndex].range.start <<
4441            " lastLBID=" << fExtentMap[lastExtentIndex].range.start +
4442            fExtentMap[lastExtentIndex].range.size * 1024 - 1 << " newHWM=" << fExtentMap[lastExtentIndex].HWM
4443            << " min=" << fExtentMap[lastExtentIndex].partition.cprange.lo_val << " max=" <<
4444            fExtentMap[lastExtentIndex].partition.cprange.hi_val << " seq=" <<
4445            fExtentMap[lastExtentIndex].partition.cprange.sequenceNum << " status=";
4446 
4447         switch (fExtentMap[lastExtentIndex].partition.cprange.isValid)
4448         {
4449             case CP_INVALID:
4450                 os << "invalid.";
4451                 break;
4452 
4453             case CP_UPDATING:
4454                 os << "updating.";
4455                 break;
4456 
4457             case CP_VALID:
4458                 os << "valid.";
4459                 break;
4460 
4461             default:
4462                 os << "unknown(!!).";
4463                 break;
4464         }
4465 
4466         if (addedAnExtent)
4467             os << "  Data extended into a new extent.";
4468 
4469         log(os.str(), logging::LOG_TYPE_DEBUG);
4470     }
4471 
4472 #endif
4473 }
4474 
bulkSetHWM(const vector<BulkSetHWMArg> & v,bool firstNode)4475 void ExtentMap::bulkSetHWM(const vector<BulkSetHWMArg>& v, bool firstNode)
4476 {
4477     grabEMEntryTable(WRITE);
4478 
4479     for (uint32_t i = 0; i < v.size(); i++)
4480         setLocalHWM(v[i].oid, v[i].partNum, v[i].segNum, v[i].hwm, firstNode, false);
4481 }
4482 
4483 class BUHasher
4484 {
4485 public:
operator ()(const BulkUpdateDBRootArg & b) const4486     inline uint64_t operator()(const BulkUpdateDBRootArg& b) const
4487     {
4488         return b.startLBID;
4489     }
4490 };
4491 
4492 class BUEqual
4493 {
4494 public:
operator ()(const BulkUpdateDBRootArg & b1,const BulkUpdateDBRootArg & b2) const4495     inline bool operator()(const BulkUpdateDBRootArg& b1, const BulkUpdateDBRootArg& b2) const
4496     {
4497         return b1.startLBID == b2.startLBID;
4498     }
4499 };
4500 
bulkUpdateDBRoot(const vector<BulkUpdateDBRootArg> & args)4501 void ExtentMap::bulkUpdateDBRoot(const vector<BulkUpdateDBRootArg>& args)
4502 {
4503     tr1::unordered_set<BulkUpdateDBRootArg, BUHasher, BUEqual> sArgs;
4504     tr1::unordered_set<BulkUpdateDBRootArg, BUHasher, BUEqual>::iterator sit;
4505     BulkUpdateDBRootArg key;
4506     int emEntries;
4507 
4508     for (uint32_t i = 0; i < args.size(); i++)
4509         sArgs.insert(args[i]);
4510 
4511     grabEMEntryTable(WRITE);
4512 
4513     emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4514 
4515     for (int i = 0; i < emEntries; i++)
4516     {
4517         key.startLBID = fExtentMap[i].range.start;
4518         sit = sArgs.find(key);
4519 
4520         if (sit != sArgs.end())
4521             fExtentMap[i].dbRoot = sit->dbRoot;
4522     }
4523 }
4524 
getExtents(int OID,vector<struct EMEntry> & entries,bool sorted,bool notFoundErr,bool incOutOfService)4525 void ExtentMap::getExtents(int OID, vector<struct EMEntry>& entries,
4526                            bool sorted, bool notFoundErr, bool incOutOfService)
4527 {
4528 #ifdef BRM_INFO
4529 
4530     if (fDebug)
4531     {
4532         TRACER_WRITELATER("getExtents");
4533         TRACER_ADDINPUT(OID);
4534         TRACER_WRITE;
4535     }
4536 
4537 #endif
4538     int i, emEntries;
4539 
4540     entries.clear();
4541 
4542     if (OID < 0)
4543     {
4544         ostringstream oss;
4545         oss << "ExtentMap::getExtents(): invalid OID requested: " << OID;
4546         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4547         throw invalid_argument(oss.str());
4548     }
4549 
4550     grabEMEntryTable(READ);
4551     emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4552     // Pre-expand entries to stop lots of small allocs
4553     entries.reserve(emEntries);
4554 
4555     if (incOutOfService)
4556     {
4557         for (i = 0 ; i < emEntries; i++)
4558             if ((fExtentMap[i].fileID == OID) &&
4559                     (fExtentMap[i].range.size != 0))
4560                 entries.push_back(fExtentMap[i]);
4561     }
4562     else
4563     {
4564         for (i = 0 ; i < emEntries; i++)
4565             if ((fExtentMap[i].fileID     == OID) &&
4566                     (fExtentMap[i].range.size != 0)   &&
4567                     (fExtentMap[i].status     != EXTENTOUTOFSERVICE))
4568                 entries.push_back(fExtentMap[i]);
4569     }
4570 
4571     releaseEMEntryTable(READ);
4572 
4573     if (sorted)
4574         sort<vector<struct EMEntry>::iterator>(entries.begin(), entries.end());
4575 }
4576 
getExtents_dbroot(int OID,vector<struct EMEntry> & entries,const uint16_t dbroot)4577 void ExtentMap::getExtents_dbroot(int OID, vector<struct EMEntry>& entries, const uint16_t dbroot)
4578 {
4579 #ifdef BRM_INFO
4580 
4581     if (fDebug)
4582     {
4583         TRACER_WRITELATER("getExtents");
4584         TRACER_ADDINPUT(OID);
4585         TRACER_WRITE;
4586     }
4587 
4588 #endif
4589 
4590 #ifdef EM_AS_A_TABLE_POC__
4591 
4592     if (OID == 1084)
4593     {
4594         EMEntry fakeEntry;
4595         fakeEntry.range.start = (1LL << 54);
4596         fakeEntry.range.size = 4;
4597         fakeEntry.fileID = 1084;
4598         fakeEntry.blockOffset = 0;
4599         fakeEntry.HWM = 1;
4600         fakeEntry.partitionNum = 0;
4601         fakeEntry.segmentNum = 0;
4602         fakeEntry.dbRoot = 1;
4603         fakeEntry.colWid = 4;
4604         fakeEntry.status = EXTENTAVAILABLE;
4605         fakeEntry.partition.cprange.hi_val = numeric_limits<int64_t>::min() + 2;
4606         fakeEntry.partition.cprange.lo_val = numeric_limits<int64_t>::max();
4607         fakeEntry.partition.cprange.sequenceNum = 0;
4608         fakeEntry.partition.cprange.isValid = CP_INVALID;
4609         entries.push_back(fakeEntry);
4610         return;
4611     }
4612 
4613 #endif
4614 
4615     int i, emEntries;
4616 
4617     entries.clear();
4618 
4619     if (OID < 0)
4620     {
4621         ostringstream oss;
4622         oss << "ExtentMap::getExtents(): invalid OID requested: " << OID;
4623         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4624         throw invalid_argument(oss.str());
4625     }
4626 
4627     grabEMEntryTable(READ);
4628     emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4629 
4630     for (i = 0 ; i < emEntries; i++)
4631         if ((fExtentMap[i].fileID == OID) &&
4632                 (fExtentMap[i].range.size != 0) && (fExtentMap[i].dbRoot == dbroot))
4633             entries.push_back(fExtentMap[i]);
4634 
4635     releaseEMEntryTable(READ);
4636 }
4637 
4638 //------------------------------------------------------------------------------
4639 // Get the number of extents for the specified OID and DBRoot.
4640 // OutOfService extents are included/excluded depending on the
4641 // value of the incOutOfService flag.
4642 //------------------------------------------------------------------------------
getExtentCount_dbroot(int OID,uint16_t dbroot,bool incOutOfService,uint64_t & numExtents)4643 void ExtentMap::getExtentCount_dbroot(int OID, uint16_t dbroot,
4644                                       bool incOutOfService, uint64_t& numExtents)
4645 {
4646     int i, emEntries;
4647 
4648     if (OID < 0)
4649     {
4650         ostringstream oss;
4651         oss << "ExtentMap::getExtentsCount_dbroot(): invalid OID requested: " <<
4652             OID;
4653         log(oss.str(), logging::LOG_TYPE_CRITICAL);
4654         throw invalid_argument(oss.str());
4655     }
4656 
4657     grabEMEntryTable(READ);
4658     emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4659 
4660     numExtents = 0;
4661 
4662     if (incOutOfService)
4663     {
4664         for (i = 0 ; i < emEntries; i++)
4665         {
4666             if ((fExtentMap[i].fileID     == OID) &&
4667                     (fExtentMap[i].range.size != 0)   &&
4668                     (fExtentMap[i].dbRoot     == dbroot))
4669                 numExtents++;
4670         }
4671     }
4672     else
4673     {
4674         for (i = 0 ; i < emEntries; i++)
4675         {
4676             if ((fExtentMap[i].fileID     == OID)    &&
4677                     (fExtentMap[i].range.size != 0)      &&
4678                     (fExtentMap[i].dbRoot     == dbroot) &&
4679                     (fExtentMap[i].status     != EXTENTOUTOFSERVICE))
4680                 numExtents++;
4681         }
4682     }
4683 
4684     releaseEMEntryTable(READ);
4685 }
4686 
4687 //------------------------------------------------------------------------------
4688 // Gets the DBRoot for the specified system catalog OID.
4689 // Function assumes the specified System Catalog OID is fully contained on
4690 // a single DBRoot, as the function only searches for and returns the first
4691 // DBRoot entry that is found in the extent map.
4692 //------------------------------------------------------------------------------
getSysCatDBRoot(OID_t oid,uint16_t & dbRoot)4693 void ExtentMap::getSysCatDBRoot(OID_t oid, uint16_t& dbRoot)
4694 {
4695 #ifdef BRM_INFO
4696 
4697     if (fDebug)
4698     {
4699         TRACER_WRITELATER("getSysCatDBRoot");
4700         TRACER_ADDINPUT(oid);
4701         TRACER_ADDSHORTOUTPUT(dbRoot);
4702         TRACER_WRITE;
4703     }
4704 
4705 #endif
4706 
4707     bool bFound = false;
4708     grabEMEntryTable(READ);
4709     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4710 
4711     for (int i = 0 ; i < emEntries; i++)
4712     {
4713         if ((fExtentMap[i].range.size != 0) &&
4714                 (fExtentMap[i].fileID     == oid))
4715         {
4716             dbRoot = fExtentMap[i].dbRoot;
4717             bFound = true;
4718             break;
4719         }
4720     }
4721 
4722     releaseEMEntryTable(READ);
4723 
4724     if (!bFound)
4725     {
4726         ostringstream oss;
4727         oss << "ExtentMap::getSysCatDBRoot(): OID not found: " << oid;
4728         log(oss.str(), logging::LOG_TYPE_WARNING);
4729         throw logic_error(oss.str());
4730     }
4731 }
4732 
4733 //------------------------------------------------------------------------------
4734 // Delete all extents for the specified OID(s) and partition number.
4735 // @bug 5237 - Removed restriction that prevented deletion of segment files in
4736 //             the last partition (for a DBRoot).
4737 //------------------------------------------------------------------------------
deletePartition(const set<OID_t> & oids,const set<LogicalPartition> & partitionNums,string & emsg)4738 void ExtentMap::deletePartition(const set<OID_t>& oids,
4739                                 const set<LogicalPartition>& partitionNums, string& emsg)
4740 {
4741 #ifdef BRM_INFO
4742 
4743     if (fDebug)
4744     {
4745         TRACER_WRITENOW("deletePartition");
4746         ostringstream oss;
4747         set<LogicalPartition>::const_iterator partIt;
4748 		oss << "partitionNums: ";
4749 		for (partIt=partitionNums.begin(); partIt!=partitionNums.end(); ++partIt)
4750 			oss << (*partIt) << " ";
4751 
4752         oss << endl;
4753         oss << "OIDS: ";
4754         set<OID_t>::const_iterator it;
4755 
4756         for (it = oids.begin(); it != oids.end(); ++it)
4757         {
4758             oss << (*it) << ", ";
4759         }
4760 
4761         TRACER_WRITEDIRECT(oss.str());
4762     }
4763 
4764 #endif
4765 
4766     if (oids.size() == 0)
4767         return;
4768 
4769     int rc = 0;
4770 
4771     grabEMEntryTable(WRITE);
4772     grabFreeList(WRITE);
4773     set<LogicalPartition> foundPartitions;
4774     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4775     vector<uint32_t> extents;
4776 
4777     // First: validate against referencing non-existent logical partitions
4778     std::set<OID_t>::const_iterator it;
4779 
4780     for (int i = 0; i < emEntries; i++)
4781     {
4782         LogicalPartition lp(fExtentMap[i].dbRoot,
4783                             fExtentMap[i].partitionNum, fExtentMap[i].segmentNum);
4784 
4785         if ((fExtentMap[i].range.size != 0) &&
4786                 (partitionNums.find(lp)   != partitionNums.end()))
4787         {
4788             it = oids.find( fExtentMap[i].fileID );
4789 
4790             if (it != oids.end())
4791             {
4792                 foundPartitions.insert(lp);
4793                 extents.push_back(i);
4794             }
4795         }
4796     }
4797 
4798     if (foundPartitions.size() != partitionNums.size())
4799     {
4800         set<LogicalPartition>::const_iterator partIt;
4801         Message::Args args;
4802         ostringstream oss;
4803 
4804         for (partIt = partitionNums.begin();
4805                 partIt != partitionNums.end(); ++partIt)
4806         {
4807             if (foundPartitions.find((*partIt)) == foundPartitions.end())
4808             {
4809                 if (!oss.str().empty())
4810                     oss << ", ";
4811 
4812                 oss << (*partIt).toString();
4813             }
4814         }
4815 
4816         args.add(oss.str());
4817         emsg = IDBErrorInfo::instance()->errorMsg(ERR_PARTITION_NOT_EXIST, args);
4818         rc = ERR_PARTITION_NOT_EXIST;
4819     }
4820 
4821     // this has to be the last error code to set and can not be over-written
4822     if (foundPartitions.empty())
4823         rc = WARN_NO_PARTITION_PERFORMED;
4824 
4825     // really delete extents
4826     for (uint32_t i = 0; i < extents.size(); i++)
4827     {
4828         deleteExtent(extents[i]);
4829     }
4830 
4831     // @bug 4772 throw exception on any error because they are all warnings.
4832     if (rc)
4833         throw IDBExcept(emsg, rc);
4834 }
4835 
4836 //------------------------------------------------------------------------------
4837 // Mark all extents as out of service, for the specified OID(s) and partition
4838 // number.
4839 // @bug 5237 - Removed restriction that prevented deletion of segment files in
4840 //             the last partition (for a DBRoot).
4841 //------------------------------------------------------------------------------
markPartitionForDeletion(const set<OID_t> & oids,const set<LogicalPartition> & partitionNums,string & emsg)4842 void ExtentMap::markPartitionForDeletion(const set<OID_t>& oids,
4843         const set<LogicalPartition>& partitionNums, string& emsg)
4844 {
4845 #ifdef BRM_INFO
4846 
4847     if (fDebug)
4848     {
4849         TRACER_WRITENOW("markPartitionForDeletion");
4850         ostringstream oss;
4851         set<LogicalPartition>::const_iterator partIt;
4852 		oss << "partitionNums: ";
4853 		for (partIt=partitionNums.begin(); partIt!=partitionNums.end(); ++partIt)
4854 			oss << (*partIt) << " ";
4855 
4856         oss << endl;
4857         oss << "OIDS: ";
4858         set<OID_t>::const_iterator it;
4859 
4860         for (it = oids.begin(); it != oids.end(); ++it)
4861         {
4862             oss << (*it) << ", ";
4863         }
4864 
4865         TRACER_WRITEDIRECT(oss.str());
4866     }
4867 
4868 #endif
4869 
4870     if (oids.size() == 0)
4871         return;
4872 
4873     int rc = 0;
4874 
4875     grabEMEntryTable(WRITE);
4876     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4877     set<LogicalPartition> foundPartitions;
4878     vector<uint32_t> extents;
4879     bool partitionAlreadyDisabled = false;
4880 
4881     // Identify not exists partition first. Then mark disable.
4882     std::set<OID_t>::const_iterator it;
4883 
4884     for (int i = 0; i < emEntries; i++)
4885     {
4886         LogicalPartition lp(fExtentMap[i].dbRoot,
4887                             fExtentMap[i].partitionNum, fExtentMap[i].segmentNum);
4888 
4889         if ((fExtentMap[i].range.size != 0) &&
4890                 (partitionNums.find(lp)   != partitionNums.end()))
4891         {
4892             it = oids.find( fExtentMap[i].fileID );
4893 
4894             if (it != oids.end())
4895             {
4896                 if (fExtentMap[i].status == EXTENTOUTOFSERVICE)
4897                 {
4898                     partitionAlreadyDisabled = true;
4899                 }
4900 
4901                 foundPartitions.insert(lp);
4902                 extents.push_back(i);
4903             }
4904         }
4905     }
4906 
4907     // really disable partitions
4908     for (uint32_t i = 0; i < extents.size(); i++)
4909     {
4910         makeUndoRecord(&fExtentMap[extents[i]], sizeof(EMEntry));
4911         fExtentMap[extents[i]].status = EXTENTOUTOFSERVICE;
4912     }
4913 
4914     // validate against referencing non-existent logical partitions
4915     if (foundPartitions.size() != partitionNums.size())
4916     {
4917         set<LogicalPartition>::const_iterator partIt;
4918         Message::Args args;
4919         ostringstream oss;
4920 
4921         for (partIt = partitionNums.begin();
4922                 partIt != partitionNums.end(); ++partIt)
4923         {
4924             if (foundPartitions.find((*partIt)) == foundPartitions.end())
4925             {
4926                 if (!oss.str().empty())
4927                     oss << ", ";
4928 
4929                 oss << (*partIt).toString();
4930             }
4931         }
4932 
4933         args.add(oss.str());
4934         emsg = emsg + string("\n") + IDBErrorInfo::instance()->errorMsg(
4935                    ERR_PARTITION_NOT_EXIST, args);
4936         rc = ERR_PARTITION_NOT_EXIST;
4937     }
4938 
4939     // check already disabled error now, which could be a non-error
4940     if (partitionAlreadyDisabled)
4941     {
4942         emsg = emsg + string("\n") + IDBErrorInfo::instance()->errorMsg(
4943                    ERR_PARTITION_ALREADY_DISABLED);
4944         rc = ERR_PARTITION_ALREADY_DISABLED;
4945     }
4946 
4947     // this rc has to be the last one set and can not be over-written by others.
4948     if (foundPartitions.empty())
4949     {
4950         rc = WARN_NO_PARTITION_PERFORMED;
4951     }
4952 
4953     // @bug 4772 throw exception on any error because they are all warnings.
4954     if (rc)
4955         throw IDBExcept(emsg, rc);
4956 }
4957 
4958 //------------------------------------------------------------------------------
4959 // Mark all extents as out of service, for the specified OID(s)
4960 //------------------------------------------------------------------------------
markAllPartitionForDeletion(const set<OID_t> & oids)4961 void ExtentMap::markAllPartitionForDeletion(const set<OID_t>& oids)
4962 {
4963 #ifdef BRM_INFO
4964 
4965     if (fDebug)
4966     {
4967         TRACER_WRITENOW("markPartitionForDeletion");
4968         ostringstream oss;
4969         oss << "OIDS: ";
4970         set<OID_t>::const_iterator it;
4971 
4972         for (it = oids.begin(); it != oids.end(); ++it)
4973         {
4974             oss << (*it) << ", ";
4975         }
4976 
4977         TRACER_WRITEDIRECT(oss.str());
4978     }
4979 
4980 #endif
4981 
4982     if (oids.size() == 0)
4983         return;
4984 
4985     set<OID_t>::const_iterator it;
4986 
4987     grabEMEntryTable(WRITE);
4988     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
4989 
4990     for (int i = 0; i < emEntries; i++)
4991     {
4992         if (fExtentMap[i].range.size  != 0  )
4993         {
4994             it = oids.find( fExtentMap[i].fileID );
4995 
4996             if (it != oids.end())
4997             {
4998                 makeUndoRecord(&fExtentMap[i], sizeof(EMEntry));
4999                 fExtentMap[i].status = EXTENTOUTOFSERVICE;
5000             }
5001         }
5002     }
5003 }
5004 
5005 //------------------------------------------------------------------------------
5006 // Restore all extents for the specified OID(s) and partition number.
5007 //------------------------------------------------------------------------------
restorePartition(const set<OID_t> & oids,const set<LogicalPartition> & partitionNums,string & emsg)5008 void ExtentMap::restorePartition(const set<OID_t>& oids,
5009                                  const set<LogicalPartition>& partitionNums, string& emsg)
5010 {
5011 #ifdef BRM_INFO
5012 
5013     if (fDebug)
5014     {
5015         TRACER_WRITENOW("restorePartition");
5016         ostringstream oss;
5017         set<LogicalPartition>::const_iterator partIt;
5018 		oss << "partitionNums: ";
5019 		for (partIt=partitionNums.begin(); partIt!=partitionNums.end(); ++partIt)
5020 			oss << (*partIt) << " ";
5021 
5022         oss << endl;
5023         oss << "OIDS: ";
5024         set<OID_t>::const_iterator it;
5025 
5026         for (it = oids.begin(); it != oids.end(); ++it)
5027         {
5028             oss << (*it) << ", ";
5029         }
5030 
5031         TRACER_WRITEDIRECT(oss.str());
5032     }
5033 
5034 #endif
5035 
5036     if (oids.size() == 0)
5037         return;
5038 
5039     set<OID_t>::const_iterator it;
5040     grabEMEntryTable(WRITE);
5041 
5042     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
5043     vector<uint32_t> extents;
5044     set<LogicalPartition> foundPartitions;
5045     bool partitionAlreadyEnabled = false;
5046 
5047     for (int i = 0; i < emEntries; i++)
5048     {
5049         LogicalPartition lp(fExtentMap[i].dbRoot, fExtentMap[i].partitionNum, fExtentMap[i].segmentNum);
5050 
5051         if ((fExtentMap[i].range.size  != 0  ) && partitionNums.find(lp) != partitionNums.end())
5052         {
5053             it = oids.find( fExtentMap[i].fileID );
5054 
5055             if (it != oids.end())
5056             {
5057                 if (fExtentMap[i].status == EXTENTAVAILABLE)
5058                 {
5059                     partitionAlreadyEnabled = true;
5060                 }
5061 
5062                 extents.push_back(i);
5063                 foundPartitions.insert(lp);
5064             }
5065         }
5066     }
5067 
5068     if (foundPartitions.size() != partitionNums.size())
5069     {
5070         set<LogicalPartition>::const_iterator partIt;
5071         Message::Args args;
5072         ostringstream oss;
5073 
5074         for (partIt = partitionNums.begin(); partIt != partitionNums.end(); ++partIt)
5075         {
5076             if (foundPartitions.empty() || foundPartitions.find((*partIt)) == foundPartitions.end())
5077             {
5078                 if (!oss.str().empty())
5079                     oss << ", ";
5080 
5081                 oss << (*partIt).toString();
5082             }
5083         }
5084 
5085         args.add(oss.str());
5086         emsg = IDBErrorInfo::instance()->errorMsg(ERR_PARTITION_NOT_EXIST, args);
5087         throw IDBExcept(emsg, ERR_PARTITION_NOT_EXIST);
5088     }
5089 
5090     // really enable partitions
5091     for (uint32_t i = 0; i < extents.size(); i++)
5092     {
5093         makeUndoRecord(&fExtentMap[extents[i]], sizeof(EMEntry));
5094         fExtentMap[extents[i]].status = EXTENTAVAILABLE;
5095     }
5096 
5097     if (partitionAlreadyEnabled)
5098     {
5099         emsg = IDBErrorInfo::instance()->errorMsg(ERR_PARTITION_ALREADY_ENABLED);
5100         throw IDBExcept(emsg, ERR_PARTITION_ALREADY_ENABLED);
5101     }
5102 }
5103 
5104 //------------------------------------------------------------------------------
5105 // Return all the out-of-service partitions for the specified OID.
5106 //------------------------------------------------------------------------------
getOutOfServicePartitions(OID_t oid,set<LogicalPartition> & partitionNums)5107 void ExtentMap::getOutOfServicePartitions(OID_t oid,
5108         set<LogicalPartition>& partitionNums)
5109 {
5110 #ifdef BRM_INFO
5111 
5112     if (fDebug)
5113     {
5114         TRACER_WRITELATER("getExtents");
5115         TRACER_ADDINPUT(oid);
5116         TRACER_WRITE;
5117     }
5118 
5119 #endif
5120 
5121     partitionNums.clear();
5122 
5123     if (oid < 0)
5124     {
5125         ostringstream oss;
5126         oss << "ExtentMap::getOutOfServicePartitions(): "
5127             "invalid OID requested: " << oid;
5128         log(oss.str(), logging::LOG_TYPE_CRITICAL);
5129         throw invalid_argument(oss.str());
5130     }
5131 
5132     grabEMEntryTable(READ);
5133     int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
5134 
5135     for (int i = 0; i < emEntries; i++)
5136     {
5137         if ((fExtentMap[i].range.size != 0  ) &&
5138                 (fExtentMap[i].fileID     == oid) &&
5139                 (fExtentMap[i].status     == EXTENTOUTOFSERVICE))
5140         {
5141 
5142             // need to be logical partition number
5143             LogicalPartition lp(fExtentMap[i].dbRoot,
5144                                 fExtentMap[i].partitionNum,
5145                                 fExtentMap[i].segmentNum);
5146             partitionNums.insert(lp);
5147         }
5148     }
5149 
5150     releaseEMEntryTable(READ);
5151 }
5152 
5153 //------------------------------------------------------------------------------
5154 // Delete all extents for the specified dbroot
5155 //------------------------------------------------------------------------------
deleteDBRoot(uint16_t dbroot)5156 void ExtentMap::deleteDBRoot(uint16_t dbroot)
5157 {
5158 #ifdef BRM_INFO
5159 
5160     if (fDebug)
5161     {
5162         TRACER_WRITENOW("deleteDBRoot");
5163         ostringstream oss;
5164         oss << "dbroot: " << dbroot;
5165         TRACER_WRITEDIRECT(oss.str());
5166     }
5167 
5168 #endif
5169 
5170     grabEMEntryTable(WRITE);
5171     grabFreeList(WRITE);
5172 
5173     for (unsigned i = 0; i < fEMShminfo->allocdSize / sizeof(struct EMEntry); i++)
5174         if (fExtentMap[i].range.size != 0 && fExtentMap[i].dbRoot == dbroot)
5175             deleteExtent(i);
5176 }
5177 
5178 //------------------------------------------------------------------------------
5179 // Does the specified DBRoot have any extents.
5180 // Throws exception if extentmap shared memory is not loaded.
5181 //------------------------------------------------------------------------------
isDBRootEmpty(uint16_t dbroot)5182 bool ExtentMap::isDBRootEmpty(uint16_t dbroot)
5183 {
5184 #ifdef BRM_INFO
5185 
5186     if (fDebug)
5187     {
5188         TRACER_WRITELATER("isDBRootEmpty");
5189 		TRACER_ADDINPUT(dbroot);
5190         TRACER_WRITE;
5191     }
5192 
5193 #endif
5194 
5195     bool bEmpty = true;
5196     int i, emEntries;
5197     grabEMEntryTable(READ);
5198     emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
5199 
5200     if (fEMShminfo->currentSize == 0)
5201     {
5202         throw runtime_error(
5203             "ExtentMap::isDBRootEmpty() shared memory not loaded");
5204     }
5205 
5206     for (i = 0; i < emEntries; i++)
5207     {
5208         if ((fExtentMap[i].range.size != 0)   &&
5209                 (fExtentMap[i].dbRoot     == dbroot))
5210         {
5211             bEmpty = false;
5212             break;
5213         }
5214     }
5215 
5216     releaseEMEntryTable(READ);
5217 
5218     return bEmpty;
5219 }
5220 
lookup(OID_t OID,LBIDRange_v & ranges)5221 void ExtentMap::lookup(OID_t OID, LBIDRange_v& ranges)
5222 {
5223 #ifdef BRM_INFO
5224 
5225     if (fDebug)
5226     {
5227         TRACER_WRITELATER("lookup");
5228         TRACER_ADDINPUT(OID);
5229         TRACER_WRITE;
5230     }
5231 
5232 #endif
5233 
5234 #ifdef EM_AS_A_TABLE_POC__
5235 
5236     if (OID == 1084)
5237     {
5238         EMEntry fakeEntry;
5239         fakeEntry.range.start = (1LL << 54);
5240         fakeEntry.range.size = 4;
5241 #if 0
5242         fakeEntry.fileID = 1084;
5243         fakeEntry.blockOffset = 0;
5244         fakeEntry.HWM = 1;
5245         fakeEntry.partitionNum = 0;
5246         fakeEntry.segmentNum = 0;
5247         fakeEntry.dbRoot = 1;
5248         fakeEntry.colWid = 4;
5249         fakeEntry.status = EXTENTAVAILABLE;
5250         fakeEntry.partition.cprange.hi_val = numeric_limits<int64_t>::min() + 2;
5251         fakeEntry.partition.cprange.lo_val = numeric_limits<int64_t>::max();
5252         fakeEntry.partition.cprange.sequenceNum = 0;
5253         fakeEntry.partition.cprange.isValid = CP_INVALID;
5254 #endif
5255         ranges.push_back(fakeEntry.range);
5256         return;
5257     }
5258 
5259 #endif
5260 
5261     int i, emEntries;
5262     LBIDRange tmp;
5263 
5264     ranges.clear();
5265 
5266     if (OID < 0)
5267     {
5268         ostringstream oss;
5269         oss << "ExtentMap::lookup(): invalid OID requested: " << OID;
5270         log(oss.str(), logging::LOG_TYPE_CRITICAL);
5271         throw invalid_argument(oss.str());
5272     }
5273 
5274     grabEMEntryTable(READ);
5275     emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
5276 
5277     for (i = 0 ; i < emEntries; i++)
5278         if ((fExtentMap[i].fileID     == OID) &&
5279                 (fExtentMap[i].range.size != 0) &&
5280                 (fExtentMap[i].status     != EXTENTOUTOFSERVICE))
5281         {
5282             tmp.start = fExtentMap[i].range.start;
5283             tmp.size = fExtentMap[i].range.size * 1024;
5284             ranges.push_back(tmp);
5285         }
5286 
5287     releaseEMEntryTable(READ);
5288 }
5289 
5290 
checkConsistency()5291 int ExtentMap::checkConsistency()
5292 {
5293 #ifdef BRM_INFO
5294 
5295     if (fDebug) TRACER_WRITENOW("checkConsistency");
5296 
5297 #endif
5298 
5299     /*
5300      LBID space consistency checks
5301     	1. verify that every LBID is either in the EM xor the freelist
5302     		a. for every segment in the EM, make sure there is no overlapping entry in the FL
5303     		b. scan both lists to verify that the entire space is represented
5304     	2. verify that there are no adjacent entries in the freelist
5305      OID consistency
5306     	3. make sure there are no gaps in the file offsets
5307     	4. make sure that only the last extent has a non-zero HWM
5308      Struct integrity
5309     	5. verify that the number of entries in each table is consistent with
5310     		the recorded current size
5311     */
5312 
5313     LBID_t emBegin, emEnd, flBegin, flEnd;
5314     int i, j, flEntries, emEntries;
5315     uint32_t usedEntries;
5316 
5317     grabEMEntryTable(READ);
5318 
5319     try
5320     {
5321         grabFreeList(READ);
5322     }
5323     catch (...)
5324     {
5325         releaseEMEntryTable(READ);
5326         throw;
5327     }
5328 
5329     flEntries = fFLShminfo->allocdSize / sizeof(InlineLBIDRange);
5330     emEntries = fEMShminfo->allocdSize / sizeof(EMEntry);
5331 
5332     // test 1a - make sure every entry in the EM is not overlapped by an entry in the FL
5333     for (i = 0; i < emEntries; i++)
5334     {
5335         if (fExtentMap[i].range.size != 0)
5336         {
5337             emBegin = fExtentMap[i].range.start;
5338             emEnd = emBegin + (fExtentMap[i].range.size * 1024) - 1;
5339 
5340             for (j = 0; j < flEntries; j++)
5341             {
5342                 if (fFreeList[j].size != 0)
5343                 {
5344                     flBegin = fFreeList[j].start;
5345                     flEnd = flBegin + (fFreeList[j].size * 1024) - 1;
5346 
5347                     //em entry overlaps the beginning
5348                     //em entry is contained within
5349                     //em entry overlaps the end
5350                     if ((emBegin <= flBegin && emEnd >= flBegin) ||
5351                             (emBegin >= flBegin && emEnd <= flEnd) ||
5352                             (emBegin <= flEnd && emEnd >= flEnd))
5353                     {
5354                         cerr << "EM::checkConsistency(): Improper LBID allocation detected" << endl;
5355                         throw logic_error("EM checkConsistency test 1a (data structures are read-locked)");
5356                     }
5357                 }
5358             }
5359         }
5360     }
5361 
5362     cout << "test 1a passed\n";
5363 
5364     //test 1b - verify that the entire LBID space is accounted for
5365 
5366     int lbid, oldlbid;
5367 
5368     lbid = 0;
5369 
5370     while (lbid < 67108864)      // 2^26  (2^36/1024)
5371     {
5372         oldlbid = lbid;
5373 
5374         for (i = 0; i < flEntries; i++)
5375         {
5376             if (fFreeList[i].start % 1024 != 0)
5377             {
5378                 cerr << "EM::checkConsistency(): A freelist entry is not 1024-block aligned" << endl;
5379                 throw logic_error("EM checkConsistency test 1b (data structures are read-locked)");
5380             }
5381 
5382             if (fFreeList[i].start / 1024 == lbid)
5383                 lbid += fFreeList[i].size;
5384         }
5385 
5386         for (i = 0; i < emEntries; i++)
5387         {
5388             if (fExtentMap[i].range.start % 1024 != 0)
5389             {
5390                 cerr << "EM::checkConsistency(): An extent map entry is not 1024-block aligned " << i << " " << fExtentMap[i].range.start <<  endl;
5391                 throw logic_error("EM checkConsistency test 1b (data structures are read-locked)");
5392             }
5393 
5394             if (fExtentMap[i].range.start / 1024 == lbid)
5395                 lbid += fExtentMap[i].range.size;
5396         }
5397 
5398         if (oldlbid == lbid)
5399         {
5400             cerr << "EM::checkConsistency(): There is a gap in the LBID space at block #" <<
5401                  static_cast<uint64_t>(lbid * 1024) << endl;
5402             throw logic_error("EM checkConsistency test 1b (data structures are read-locked)");
5403         }
5404     }
5405 
5406     cout << "test 1b passed\n";
5407 
5408     // test 1c - verify that no dbroot is < 1
5409     bool errorOut = false;
5410 
5411     for (i = 0; i < emEntries; i++)
5412     {
5413         if (fExtentMap[i].range.size != 0)
5414         {
5415             //cout << "EM[" << i << "]: dbRoot=" << fExtentMap[i].dbRoot(listMan) << endl;
5416             if (fExtentMap[i].dbRoot == 0)
5417             {
5418                 errorOut = true;
5419                 cerr << "EM::checkConsistency(): index " << i << " has a 0 dbroot\n";
5420             }
5421         }
5422     }
5423 
5424     if (errorOut)
5425         throw logic_error("EM checkConsistency test 1c (data structures are read-locked)");
5426 
5427     cout << "test 1c passed\n";
5428 
5429 #if 0  // a test ported from the tek2 branch, which requires a RID field to be stored; not relevant here
5430     // test 1d - verify that each <OID, RID> pair is unique
5431     cout << "Running test 1d\n";
5432 
5433     set<OIDRID> uniquer;
5434 
5435     for (i = 0; i < emEntries; i++)
5436     {
5437         if (fExtentMap[i].size != 0 && !fExtentMap[i].isDict())
5438         {
5439             OIDRID element(fExtentMap[i].fileID, fExtentMap[i].rid);
5440 
5441             if (uniquer.insert(element).second == false)
5442                 throw logic_error("EM consistency test 1d failed (data structures are read-locked)");
5443         }
5444     }
5445 
5446     uniquer.clear();
5447     cout << "Test 1d passed\n";
5448 #endif
5449 
5450     // test 2 - verify that the freelist is consolidated
5451     for (i = 0; i < flEntries; i++)
5452     {
5453         if (fFreeList[i].size != 0)
5454         {
5455             flEnd = fFreeList[i].start + (fFreeList[i].size * 1024);
5456 
5457             for (j = i + 1; j < flEntries; j++)
5458                 if (fFreeList[j].size != 0 && fFreeList[j].start == flEnd)
5459                     throw logic_error("EM checkConsistency test 2 (data structures are read-locked)");
5460         }
5461     }
5462 
5463     cout << "test 2 passed\n";
5464 
5465 // needs to be updated
5466 #if 0
5467     // test 3 - scan the extent map to make sure files have no LBID gaps
5468     vector<OID_t> oids;
5469     vector< vector<uint32_t> > fbos;
5470 
5471     for (i = 0; i < emEntries; i++)
5472     {
5473         if (fExtentMap[i].size != 0)
5474         {
5475             for (j = 0; j < (int)oids.size(); j++)
5476                 if (oids[j] == fExtentMap[i].fileID)
5477                     break;
5478 
5479             if (j == (int)oids.size())
5480             {
5481                 oids.push_back(fExtentMap[i].fileID);
5482                 fbos.push_back(vector<uint32_t>());
5483             }
5484 
5485             fbos[j].push_back(fExtentMap[i].blockOffset);
5486         }
5487     }
5488 
5489     for (i = 0; i < (int)fbos.size(); i++)
5490         sort<vector<uint32_t>::iterator>(fbos[i].begin(), fbos[i].end());
5491 
5492     const unsigned EXTENT_SIZE = getExtentSize();
5493 
5494     for (i = 0; i < (int)fbos.size(); i++)
5495     {
5496         for (j = 0; j < (int)fbos[i].size(); j++)
5497         {
5498             if (fbos[i][j] != static_cast<uint32_t>(j * EXTENT_SIZE))
5499             {
5500                 cerr << "EM: OID " << oids[i] << " has no extent at FBO " <<
5501                      j* EXTENT_SIZE << endl;
5502                 throw logic_error("EM checkConsistency test 3 (data structures are read-locked)");
5503             }
5504         }
5505     }
5506 
5507     fbos.clear();
5508     oids.clear();
5509 #endif
5510 
5511 
5512     // test 5a - scan freelist to make sure the current size is accurate
5513 
5514     for (i = 0, usedEntries = 0; i < emEntries; i++)
5515         if (fExtentMap[i].range.size != 0)
5516             usedEntries++;
5517 
5518     if (usedEntries != fEMShminfo->currentSize / sizeof(EMEntry))
5519     {
5520         cerr << "checkConsistency: used extent map entries = " << usedEntries
5521              << " metadata says " << fEMShminfo->currentSize / sizeof(EMEntry)
5522              << endl;
5523         throw logic_error("EM checkConsistency test 5a (data structures are read-locked)");
5524     }
5525 
5526     for (i = 0, usedEntries = 0; i < flEntries; i++)
5527         if (fFreeList[i].size != 0)
5528             usedEntries++;
5529 
5530     if (usedEntries != fFLShminfo->currentSize / sizeof(InlineLBIDRange))
5531     {
5532         cerr << "checkConsistency: used freelist entries = " << usedEntries
5533              << " metadata says " << fFLShminfo->currentSize / sizeof(InlineLBIDRange)
5534              << endl;
5535         throw logic_error("EM checkConsistency test 5a (data structures are read-locked)");
5536     }
5537 
5538     cout << "test 5a passed\n";
5539 
5540     releaseFreeList(READ);
5541     releaseEMEntryTable(READ);
5542     return 0;
5543 }
5544 
5545 
setReadOnly()5546 void ExtentMap::setReadOnly()
5547 {
5548     r_only = true;
5549 }
5550 
undoChanges()5551 void ExtentMap::undoChanges()
5552 {
5553 #ifdef BRM_INFO
5554 
5555     if (fDebug) TRACER_WRITENOW("undoChanges");
5556 
5557 #endif
5558     Undoable::undoChanges();
5559     finishChanges();
5560 }
5561 
confirmChanges()5562 void ExtentMap::confirmChanges()
5563 {
5564 #ifdef BRM_INFO
5565 
5566     if (fDebug) TRACER_WRITENOW("confirmChanges");
5567 
5568 #endif
5569     Undoable::confirmChanges();
5570     finishChanges();
5571 }
5572 
finishChanges()5573 void ExtentMap::finishChanges()
5574 {
5575     if (flLocked)
5576         releaseFreeList(WRITE);
5577 
5578     if (emLocked)
5579         releaseEMEntryTable(WRITE);
5580 }
5581 
getEMFLLockStatus()5582 const bool* ExtentMap::getEMFLLockStatus()
5583 {
5584     return &flLocked;
5585 }
5586 
getEMLockStatus()5587 const bool* ExtentMap::getEMLockStatus()
5588 {
5589     return &emLocked;
5590 }
5591 
5592 //------------------------------------------------------------------------------
5593 // Reload Config cache if config file time stamp has changed
5594 //------------------------------------------------------------------------------
checkReloadConfig()5595 void ExtentMap::checkReloadConfig()
5596 {
5597     config::Config* cf = config::Config::makeConfig();
5598 
5599     // Immediately return if Columnstore.xml timestamp has not changed
5600     if (cf->getCurrentMTime() == fCacheTime)
5601         return;
5602 
5603     //--------------------------------------------------------------------------
5604     // Initialize outdated attribute still used by primitiveserver.
5605     // Hardcode to 8K for now, since that's all we support.
5606     //--------------------------------------------------------------------------
5607     ExtentSize = 0x2000;
5608 
5609 //	string es = cf->getConfig("ExtentMap", "ExtentSize");
5610 //	if (es.length() == 0) es = "8K";
5611 //	if (es == "8K" || es == "8k")
5612 //	{
5613 //		ExtentSize = 0x2000;
5614 //	}
5615 //	else if (es == "1K" || es == "1k")
5616 //	{
5617 //		ExtentSize = 0x400;
5618 //	}
5619 //	else if (es == "64K" || es == "64k")
5620 //	{
5621 //		ExtentSize = 0x10000;
5622 //	}
5623 //	else
5624 //	{
5625 //		throw logic_error("Invalid ExtentSize found in config file!");
5626 //	}
5627 
5628     //--------------------------------------------------------------------------
5629     // Initialize number of rows per extent
5630     // Hardcode to 8M for now, since that's all we support.
5631     //--------------------------------------------------------------------------
5632     ExtentRows = 0x800000;
5633 
5634 //	string er = cf->getConfig("ExtentMap", "ExtentRows");
5635 //	if (er.length() == 0) er = "8M";
5636 //	if (er == "8M" || er == "8m")
5637 //	{
5638 //		ExtentRows = 0x800000;
5639 //	}
5640 //	else if (er == "1M" || er == "1m")
5641 //	{
5642 //		ExtentRows = 0x100000;
5643 //	}
5644 //	else if (er == "64M" || er == "64m")
5645 //	{
5646 //		ExtentRows = 0x4000000;
5647 //	}
5648 //	else
5649 //	{
5650 //		throw logic_error("Invalid ExtentRows found in config file!");
5651 //	}
5652 
5653     //--------------------------------------------------------------------------
5654     // Initialize segment files per physical partition
5655     //--------------------------------------------------------------------------
5656     string fpc = cf->getConfig("ExtentMap", "FilesPerColumnPartition");
5657     filesPerColumnPartition = cf->uFromText(fpc);
5658 
5659     if (filesPerColumnPartition == 0)
5660         filesPerColumnPartition = 4;
5661 
5662     // Get latest Columnstore.xml timestamp after first access forced a reload
5663     fCacheTime = cf ->getLastMTime();
5664 
5665     //--------------------------------------------------------------------------
5666     // Initialize extents per segment file
5667     //--------------------------------------------------------------------------
5668     string epsf = cf->getConfig("ExtentMap", "ExtentsPerSegmentFile");
5669     extentsPerSegmentFile = cf->uFromText(epsf);
5670 
5671     if (extentsPerSegmentFile == 0)
5672         extentsPerSegmentFile = 2;
5673 }
5674 
5675 //------------------------------------------------------------------------------
// Returns the ExtentSize attribute (hardcoded to 0x2000, i.e. 8K).
5677 // Mutex lock and call to checkReloadConfig() not currently necessary since,
5678 // going with hardcoded value.  See checkReloadConfig().
5679 //------------------------------------------------------------------------------
getExtentSize()5680 unsigned ExtentMap::getExtentSize()       // dmc-should deprecate
5681 {
5682 //	boost::mutex::scoped_lock lk(fConfigCacheMutex);
5683 //	checkReloadConfig( );
5684 
5685     ExtentSize = 0x2000;
5686     return ExtentSize;
5687 }
5688 
5689 //------------------------------------------------------------------------------
// Returns the number of rows per extent.  Only supported values are 1m, 8m,
5691 // and 64m.
5692 // Mutex lock and call to checkReloadConfig() not currently necessary since,
5693 // going with hardcoded value.  See checkReloadConfig().
5694 //------------------------------------------------------------------------------
getExtentRows()5695 unsigned ExtentMap::getExtentRows()
5696 {
5697 //	boost::mutex::scoped_lock lk(fConfigCacheMutex);
5698 //	checkReloadConfig( );
5699 
5700     ExtentRows = 0x800000;
5701     return ExtentRows;
5702 }
5703 
5704 //------------------------------------------------------------------------------
5705 // Returns the number of column segment files for an OID, that make up a
5706 // partition.
5707 //------------------------------------------------------------------------------
getFilesPerColumnPartition()5708 unsigned ExtentMap::getFilesPerColumnPartition()
5709 {
5710     boost::mutex::scoped_lock lk(fConfigCacheMutex);
5711     checkReloadConfig( );
5712 
5713     return filesPerColumnPartition;
5714 }
5715 
5716 //------------------------------------------------------------------------------
5717 // Returns the number of extents in a segment file.
5718 //------------------------------------------------------------------------------
getExtentsPerSegmentFile()5719 unsigned ExtentMap::getExtentsPerSegmentFile()
5720 {
5721     boost::mutex::scoped_lock lk(fConfigCacheMutex);
5722     checkReloadConfig( );
5723 
5724     return extentsPerSegmentFile;
5725 }
5726 
5727 //------------------------------------------------------------------------------
5728 // Returns the number of DBRoots to be used in storing db column files.
5729 //------------------------------------------------------------------------------
getDbRootCount()5730 unsigned ExtentMap::getDbRootCount()
5731 {
5732     oam::OamCache* oamcache = oam::OamCache::makeOamCache();
5733     unsigned int rootCnt = oamcache->getDBRootCount();
5734 
5735     return rootCnt;
5736 }
5737 
5738 //------------------------------------------------------------------------------
5739 // Get list of DBRoots that map to the specified PM.  DBRoot list is cached
5740 // internally in fPmDbRootMap after getting from Columnstore.xml via OAM.
5741 //------------------------------------------------------------------------------
getPmDbRoots(int pm,vector<int> & dbRootList)5742 void ExtentMap::getPmDbRoots( int pm, vector<int>& dbRootList )
5743 {
5744     oam::OamCache* oamcache = oam::OamCache::makeOamCache();
5745     oam::OamCache::PMDbrootsMap_t pmDbroots = oamcache->getPMToDbrootsMap();
5746 
5747     dbRootList.clear();
5748     dbRootList = (*pmDbroots)[pm];
5749 }
5750 
getFreeListEntries()5751 vector<InlineLBIDRange> ExtentMap::getFreeListEntries()
5752 {
5753     vector<InlineLBIDRange> v;
5754     grabEMEntryTable(READ);
5755     grabFreeList(READ);
5756 
5757     int allocdSize = fFLShminfo->allocdSize / sizeof(InlineLBIDRange);
5758 
5759     for (int i = 0; i < allocdSize; i++)
5760         v.push_back(fFreeList[i]);
5761 
5762     releaseFreeList(READ);
5763     releaseEMEntryTable(READ);
5764     return v;
5765 }
5766 
dumpTo(ostream & os)5767 void ExtentMap::dumpTo(ostream& os)
5768 {
5769     grabEMEntryTable(READ);
5770     unsigned emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry);
5771 
5772     for (unsigned i = 0; i < emEntries; i++)
5773     {
5774         if (fExtentMap[i].range.size != 0)
5775         {
5776             os << fExtentMap[i].range.start << '|'
5777                << fExtentMap[i].range.size << '|'
5778                << fExtentMap[i].fileID << '|'
5779                << fExtentMap[i].blockOffset << '|'
5780                << fExtentMap[i].HWM << '|'
5781                << fExtentMap[i].partitionNum << '|'
5782                << fExtentMap[i].segmentNum << '|'
5783                << fExtentMap[i].dbRoot << '|'
5784                << fExtentMap[i].colWid << '|'
5785                << fExtentMap[i].status << '|'
5786                << fExtentMap[i].partition.cprange.hi_val << '|'
5787                << fExtentMap[i].partition.cprange.lo_val << '|'
5788                << fExtentMap[i].partition.cprange.sequenceNum << '|'
5789                << (int)fExtentMap[i].partition.cprange.isValid << '|'
5790                << endl;
5791         }
5792     }
5793 
5794     releaseEMEntryTable(READ);
5795 }
5796 
5797 /*int ExtentMap::physicalPartitionNum(const set<OID_t>& oids,
5798 	                       const set<uint32_t>& partitionNums,
5799 	                       vector<PartitionInfo>& partitionInfos)
5800 {
5801 #ifdef BRM_INFO
5802 	if (fDebug)
5803 	{
5804 		TRACER_WRITENOW("physicalPartitionNum");
5805 		ostringstream oss;
5806 		set<uint32_t>::const_iterator partIt;
5807 		oss << "partitionNums: "
5808 		for (partIt=partitionNums.begin(); it!=partitionNums.end(); ++it)
5809 			oss << (*it) << " ";
5810 		oss << endl;
5811 		TRACER_WRITEDIRECT(oss.str());
5812 	}
5813 #endif
5814 
5815 	set<OID_t>::const_iterator it;
5816 	grabEMEntryTable(READ);
5817 
5818 	int emEntries = fEMShminfo->allocdSize/sizeof(struct EMEntry);
5819 	PartitionInfo partInfo;
5820 	vector<uint32_t> extents;
5821 	set<uint32_t> foundPartitions;
5822 	for (int i = 0; i < emEntries; i++)
5823 	{
5824 		if ((fExtentMap[i].range.size  != 0  ) &&
5825 			partitionNums.find(logicalPartitionNum(fExtentMap[i])) != partitionNums.end())
5826 		{
5827 			it = oids.find( fExtentMap[i].fileID );
5828 			if (it != oids.end())
5829 			{
5830 				partInfo.oid = fExtentMap[i].fileID;
5831 				partInfo.lp.dbroot = fExtentMap[i].dbRoot;
5832 				partInfo.lp.pp = fExtentMap[i].partitionNum;
5833 				partInfo.lp.seg = fExtentMap[i].segmentNum;
5834 				partitionInfos.push_back(partInfo);
5835 			}
5836 		}
5837 	}
5838 	releaseEMEntryTable(READ);
5839 	return 0;
5840 }
5841 */
5842 
5843 }	//namespace
5844 // vim:ts=4 sw=4:
5845 
5846