1###################### 2Hierarchical Documents 3###################### 4 5:doc:`Python <hierarchical-documents>` **Java** 6 7Goal 8==== 9 10Create a representation for hierarchical `documents <http://en.wikipedia.org/wiki/Document-oriented_database>`_. 11 12Challenge 13========= 14 15Support efficient storage and retrieval of documents, both as a whole and by subdocuments specified by paths. 16 17Explanation 18=========== 19 20A hierarchical document has a tree-like structure, with the document ID as the root. We'll map the hierarchy to a list of tuples in which each tuple corresponds to a path from the root to a leaf. These tuples will form keys, so each leaf is indexed by the path leading to it. 21 22Ordering 23======== 24 25Because each tuple represents a path from the document root to a leaf, the lexicographic ordering of tuples guarantees that adjacent paths will be stored in adjacent keys. Each tuple prefix will correspond to a subdocument that can be retrieved using a prefix range read. Likewise, a range read using the root as a prefix will retrieve the entire document. 26 27Pattern 28======= 29 30A document will consist of a dictionary whose values may be simple data types (e.g., integers or strings), lists, or (nested) dictionaries. Each document will be stored under a unique ID. If a document ID has not already been supplied, we randomly generate one. 31 32We convert the document to a list of tuples representing each path from the root to a leaf. Each tuple is used to construct a composite key within a subspace. The document ID becomes the first element after the subspace prefix, followed by the remainder of the path. We store the leaf (the last element of the tuple) as the value, which enables storage of larger data sizes (see :ref:`Key and value sizes <data-modeling-performance-guidelines>`). 33 34If we're given a serialized JSON object to start with, we just deserialize it before converting it to tuples. To distinguish the list elements in the document (a.k.a. JSON arrays) from dictionary elements and preserve the order of the lists, we include the index of each list element before it in the tuple. 35 36We can retrieve any subdocument based on the partial path to its root. The partial path will just be a tuple that the query function uses as a key prefix for a range read. The retrieved data will be a list of tuples. The final step before returning the data is to convert it back to a document. 37 38Extensions 39========== 40 41Indexing 42-------- 43 44We could extend the document model to allow selective indexing of keys or values at specified locations with a document. 45 46Query language 47-------------- 48 49We could extend the document to support more powerful query capabilities, either with query functions or a full query language. Either would be designed to take advantage of existing indexes. 50 51Code 52==== 53 54Here’s a basic implementation of the recipe. 55 56.. code-block:: java 57 58 import java.util.ArrayList; 59 import java.util.HashMap; 60 import java.util.Map; 61 import java.util.Map.Entry; 62 63 public class MicroDoc { 64 65 private static final FDB fdb; 66 private static final Database db; 67 private static final Subspace docSpace; 68 private static final long EMPTY_OBJECT = -2; 69 private static final long EMPTY_ARRAY = -1; 70 71 static { 72 fdb = FDB.selectAPIVersion(610); 73 db = fdb.open(); 74 docSpace = new Subspace(Tuple.from("D")); 75 } 76 77 @SuppressWarnings("unchecked") 78 private static ArrayList<Tuple> toTuplesSwitch(Object o){ 79 if(o instanceof ArrayList){ 80 return toTuples((ArrayList<Object>) o); 81 } else if(o instanceof Map){ 82 return toTuples((Map<Object,Object>) o); 83 } else { 84 return toTuples(o); 85 } 86 } 87 88 private static ArrayList<Tuple> toTuples(ArrayList<Object> item){ 89 if(item.isEmpty()){ 90 ArrayList<Tuple> val = new ArrayList<Tuple>(); 91 val.add(Tuple.from(EMPTY_ARRAY, null)); 92 return val; 93 } else { 94 ArrayList<Tuple> val = new ArrayList<Tuple>(); 95 for(int i = 0; i < item.size(); i++){ 96 for(Tuple sub : toTuplesSwitch(item.get(i))){ 97 val.add(Tuple.from(i).addAll(sub)); 98 } 99 } 100 return val; 101 } 102 } 103 104 private static ArrayList<Tuple> toTuples(Map<Object,Object> item){ 105 if(item.isEmpty()){ 106 ArrayList<Tuple> val = new ArrayList<Tuple>(); 107 val.add(Tuple.from(EMPTY_OBJECT, null)); 108 return val; 109 } else { 110 ArrayList<Tuple> val = new ArrayList<Tuple>(); 111 for(Entry<Object,Object> e : item.entrySet()){ 112 for(Tuple sub : toTuplesSwitch(e.getValue())){ 113 val.add(Tuple.from(e.getKey()).addAll(sub)); 114 } 115 } 116 return val; 117 } 118 } 119 120 private static ArrayList<Tuple> toTuples(Object item){ 121 ArrayList<Tuple> val = new ArrayList<Tuple>(); 122 val.add(Tuple.from(item)); 123 return val; 124 } 125 126 private static ArrayList<Tuple> getTruncated(ArrayList<Tuple> vals){ 127 ArrayList<Tuple> list = new ArrayList<Tuple>(); 128 for(Tuple val : vals){ 129 list.add(val.popFront()); 130 } 131 return list; 132 } 133 134 private static Object fromTuples(ArrayList<Tuple> tuples){ 135 if(tuples == null){ 136 return null; 137 } 138 139 Tuple first = tuples.get(0); // Determine kind of object from 140 // first tuple. 141 if(first.size() == 1){ 142 return first.get(0); // Primitive type. 143 } 144 145 if(first.equals(Tuple.from(EMPTY_OBJECT, null))){ 146 return new HashMap<Object,Object>(); // Empty map. 147 } 148 149 if(first.equals(Tuple.from(EMPTY_ARRAY))){ 150 return new ArrayList<Object>(); // Empty list. 151 } 152 153 HashMap<Object,ArrayList<Tuple>> groups = new HashMap<Object,ArrayList<Tuple>>(); 154 for(Tuple t : tuples){ 155 if(groups.containsKey(t.get(0))){ 156 groups.get(t.get(0)).add(t); 157 } else { 158 ArrayList<Tuple> list = new ArrayList<Tuple>(); 159 list.add(t); 160 groups.put(t.get(0),list); 161 } 162 } 163 164 if(first.get(0).equals(0l)){ 165 // Array. 166 ArrayList<Object> array = new ArrayList<Object>(); 167 for(Entry<Object,ArrayList<Tuple>> g : groups.entrySet()){ 168 array.add(fromTuples(getTruncated(g.getValue()))); 169 } 170 return array; 171 } else { 172 // Object. 173 HashMap<Object,Object> map = new HashMap<Object,Object>(); 174 for(Entry<Object,ArrayList<Tuple>> g : groups.entrySet()){ 175 map.put(g.getKey(), fromTuples(getTruncated(g.getValue()))); 176 } 177 return map; 178 } 179 } 180 181 public static Object insertDoc(TransactionContext tcx, final Map<Object,Object> doc){ 182 return tcx.run(tr -> { 183 if(!doc.containsKey("doc_id")){ 184 doc.put("doc_id", getNewID(tr)); 185 } 186 for(Tuple t : toTuples(doc)){ 187 tr.set(docSpace.pack(Tuple.from(doc.get("doc_id")).addAll(t.popBack())), 188 Tuple.from(t.get(t.size() - 1)).pack()); 189 } 190 return doc.get("doc_id"); 191 }); 192 } 193 194 public static Object getDoc(TransactionContext tcx, final Object ID){ 195 return getDoc(tcx, ID, Tuple.from()); 196 } 197 198 public static Object getDoc(TransactionContext tcx, final Object ID, final Tuple prefix){ 199 return tcx.run(tr -> { 200 Future<byte[]> v = tr.get(docSpace.pack(Tuple.from(ID).addAll(prefix))); 201 if(v.get() != null){ 202 // One single item. 203 ArrayList<Tuple> vals = new ArrayList<Tuple>(); 204 vals.add(prefix.addAll(Tuple.fromBytes(v.get()))); 205 return fromTuples(vals); 206 } else { 207 // Multiple items. 208 ArrayList<Tuple> vals = new ArrayList<Tuple>(); 209 for(KeyValue kv : tr.getRange(docSpace.range(Tuple.from(ID).addAll(prefix)))){ 210 vals.add(docSpace.unpack(kv.getKey()).popFront().addAll(Tuple.fromBytes(kv.getValue()))); 211 } 212 return fromTuples(vals); 213 } 214 }); 215 } 216 217 private static int getNewID(TransactionContext tcx){ 218 return tcx.run(tr -> { 219 boolean found = false; 220 int newID; 221 do { 222 newID = (int)(Math.random()*100000000); 223 found = true; 224 for(KeyValue kv : tr.getRange(docSpace.range(Tuple.from(newID)))){ 225 // If not empty, this is false. 226 found = false; 227 break; 228 } 229 } while(!found); 230 return newID; 231 }); 232 } 233 } 234