1######################
2Hierarchical Documents
3######################
4
5:doc:`Python <hierarchical-documents>` **Java**
6
7Goal
8====
9
10Create a representation for hierarchical `documents <http://en.wikipedia.org/wiki/Document-oriented_database>`_.
11
12Challenge
13=========
14
15Support efficient storage and retrieval of documents, both as a whole and by subdocuments specified by paths.
16
17Explanation
18===========
19
20A hierarchical document has a tree-like structure, with the document ID as the root. We'll map the hierarchy to a list of tuples in which each tuple corresponds to a path from the root to a leaf. These tuples will form keys, so each leaf is indexed by the path leading to it.
21
22Ordering
23========
24
25Because each tuple represents a path from the document root to a leaf, the lexicographic ordering of tuples guarantees that adjacent paths will be stored in adjacent keys. Each tuple prefix will correspond to a subdocument that can be retrieved using a prefix range read. Likewise, a range read using the root as a prefix will retrieve the entire document.
26
27Pattern
28=======
29
30A document will consist of a dictionary whose values may be simple data types (e.g., integers or strings), lists, or (nested) dictionaries. Each document will be stored under a unique ID. If a document ID has not already been supplied, we randomly generate one.
31
32We convert the document to a list of tuples representing each path from the root to a leaf. Each tuple is used to construct a composite key within a subspace. The document ID becomes the first element after the subspace prefix, followed by the remainder of the path. We store the leaf (the last element of the tuple) as the value, which enables storage of larger data sizes (see :ref:`Key and value sizes <data-modeling-performance-guidelines>`).
33
34If we're given a serialized JSON object to start with, we just deserialize it before converting it to tuples. To distinguish the list elements in the document (a.k.a. JSON arrays) from dictionary elements and preserve the order of the lists, we include the index of each list element before it in the tuple.
35
36We can retrieve any subdocument based on the partial path to its root. The partial path will just be a tuple that the query function uses as a key prefix for a range read. The retrieved data will be a list of tuples. The final step before returning the data is to convert it back to a document.
37
38Extensions
39==========
40
41Indexing
42--------
43
44We could extend the document model to allow selective indexing of keys or values at specified locations with a document.
45
46Query language
47--------------
48
49We could extend the document to support more powerful query capabilities, either with query functions or a full query language. Either would be designed to take advantage of existing indexes.
50
51Code
52====
53
54Here’s a basic implementation of the recipe.
55
56.. code-block:: java
57
58    import java.util.ArrayList;
59    import java.util.HashMap;
60    import java.util.Map;
61    import java.util.Map.Entry;
62
63    public class MicroDoc {
64
65        private static final FDB fdb;
66        private static final Database db;
67        private static final Subspace docSpace;
68        private static final long EMPTY_OBJECT = -2;
69        private static final long EMPTY_ARRAY = -1;
70
71        static {
72            fdb = FDB.selectAPIVersion(610);
73            db = fdb.open();
74            docSpace = new Subspace(Tuple.from("D"));
75        }
76
77        @SuppressWarnings("unchecked")
78        private static ArrayList<Tuple> toTuplesSwitch(Object o){
79            if(o instanceof ArrayList){
80                return toTuples((ArrayList<Object>) o);
81            } else if(o instanceof Map){
82                return toTuples((Map<Object,Object>) o);
83            } else {
84                return toTuples(o);
85            }
86        }
87
88        private static ArrayList<Tuple> toTuples(ArrayList<Object> item){
89            if(item.isEmpty()){
90                ArrayList<Tuple> val = new ArrayList<Tuple>();
91                val.add(Tuple.from(EMPTY_ARRAY, null));
92                return val;
93            } else {
94                ArrayList<Tuple> val = new ArrayList<Tuple>();
95                for(int i = 0; i < item.size(); i++){
96                    for(Tuple sub : toTuplesSwitch(item.get(i))){
97                        val.add(Tuple.from(i).addAll(sub));
98                    }
99                }
100                return val;
101            }
102        }
103
104        private static ArrayList<Tuple> toTuples(Map<Object,Object> item){
105            if(item.isEmpty()){
106                ArrayList<Tuple> val = new ArrayList<Tuple>();
107                val.add(Tuple.from(EMPTY_OBJECT, null));
108                return val;
109            } else {
110                ArrayList<Tuple> val = new ArrayList<Tuple>();
111                for(Entry<Object,Object> e : item.entrySet()){
112                    for(Tuple sub : toTuplesSwitch(e.getValue())){
113                        val.add(Tuple.from(e.getKey()).addAll(sub));
114                    }
115                }
116                return val;
117            }
118        }
119
120        private static ArrayList<Tuple> toTuples(Object item){
121            ArrayList<Tuple> val = new ArrayList<Tuple>();
122            val.add(Tuple.from(item));
123            return val;
124        }
125
126        private static ArrayList<Tuple> getTruncated(ArrayList<Tuple> vals){
127            ArrayList<Tuple> list = new ArrayList<Tuple>();
128            for(Tuple val : vals){
129                list.add(val.popFront());
130            }
131            return list;
132        }
133
134        private static Object fromTuples(ArrayList<Tuple> tuples){
135            if(tuples == null){
136                return null;
137            }
138
139            Tuple first = tuples.get(0); // Determine kind of object from
140                                         // first tuple.
141            if(first.size() == 1){
142                return first.get(0); // Primitive type.
143            }
144
145            if(first.equals(Tuple.from(EMPTY_OBJECT, null))){
146                return new HashMap<Object,Object>(); // Empty map.
147            }
148
149            if(first.equals(Tuple.from(EMPTY_ARRAY))){
150                return new ArrayList<Object>(); // Empty list.
151            }
152
153            HashMap<Object,ArrayList<Tuple>> groups = new HashMap<Object,ArrayList<Tuple>>();
154            for(Tuple t : tuples){
155                if(groups.containsKey(t.get(0))){
156                    groups.get(t.get(0)).add(t);
157                } else {
158                    ArrayList<Tuple> list = new ArrayList<Tuple>();
159                    list.add(t);
160                    groups.put(t.get(0),list);
161                }
162            }
163
164            if(first.get(0).equals(0l)){
165                // Array.
166                ArrayList<Object> array = new ArrayList<Object>();
167                for(Entry<Object,ArrayList<Tuple>> g : groups.entrySet()){
168                    array.add(fromTuples(getTruncated(g.getValue())));
169                }
170                return array;
171            } else {
172                // Object.
173                HashMap<Object,Object> map = new HashMap<Object,Object>();
174                for(Entry<Object,ArrayList<Tuple>> g : groups.entrySet()){
175                    map.put(g.getKey(), fromTuples(getTruncated(g.getValue())));
176                }
177                return map;
178            }
179        }
180
181        public static Object insertDoc(TransactionContext tcx, final Map<Object,Object> doc){
182            return tcx.run(tr -> {
183                if(!doc.containsKey("doc_id")){
184                    doc.put("doc_id", getNewID(tr));
185                }
186                for(Tuple t : toTuples(doc)){
187                    tr.set(docSpace.pack(Tuple.from(doc.get("doc_id")).addAll(t.popBack())),
188                            Tuple.from(t.get(t.size() - 1)).pack());
189                }
190                return doc.get("doc_id");
191            });
192        }
193
194        public static Object getDoc(TransactionContext tcx, final Object ID){
195            return getDoc(tcx, ID, Tuple.from());
196        }
197
198        public static Object getDoc(TransactionContext tcx, final Object ID, final Tuple prefix){
199            return tcx.run(tr -> {
200                Future<byte[]> v = tr.get(docSpace.pack(Tuple.from(ID).addAll(prefix)));
201                if(v.get() != null){
202                    // One single item.
203                    ArrayList<Tuple> vals = new ArrayList<Tuple>();
204                    vals.add(prefix.addAll(Tuple.fromBytes(v.get())));
205                    return fromTuples(vals);
206                } else {
207                    // Multiple items.
208                    ArrayList<Tuple> vals =  new ArrayList<Tuple>();
209                    for(KeyValue kv : tr.getRange(docSpace.range(Tuple.from(ID).addAll(prefix)))){
210                        vals.add(docSpace.unpack(kv.getKey()).popFront().addAll(Tuple.fromBytes(kv.getValue())));
211                    }
212                    return fromTuples(vals);
213                }
214            });
215        }
216
217        private static int getNewID(TransactionContext tcx){
218            return tcx.run(tr -> {
219                boolean found = false;
220                int newID;
221                do {
222                    newID = (int)(Math.random()*100000000);
223                    found = true;
224                    for(KeyValue kv : tr.getRange(docSpace.range(Tuple.from(newID)))){
225                        // If not empty, this is false.
226                        found = false;
227                        break;
228                    }
229                } while(!found);
230                return newID;
231            });
232        }
233    }
234