// copyright (C) 2004 graydon hoare <graydon@pobox.com>
// all rights reserved.
// licensed to the public under the terms of the GNU GPL (>= 2)
// see the file COPYING for details

#include <cctype>
#include <cstdlib>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include <boost/lexical_cast.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/shared_ptr.hpp>

#include "basic_io.hh"
#include "change_set.hh"
#include "constants.hh"
#include "revision.hh"
#include "sanity.hh"
#include "transforms.hh"
#include "vocab.hh"


// calculating least common ancestors is a delicate thing.
//
// it turns out that we cannot choose the simple "least common ancestor"
// for purposes of a merge, because it is possible that there are two
// equally reachable common ancestors, and this produces ambiguity in the
// merge. the result -- in a pathological case -- is silently accepting one
// set of edits while discarding another; not exactly what you want a
// version control tool to do.
//
// a conservative approximation is what we'll call a "subgraph recurring"
// LCA algorithm. this is somewhat like locating the least common dominator
// node, but not quite. it is actually just a vanilla LCA search, except
// that any time there's a fork (a historical merge looks like a fork from
// our perspective, working backwards from children to parents) it reduces
// the fork to a common parent via a sequence of pairwise recursive calls
// to itself before proceeding. this will always resolve to a common parent
// with no ambiguity, unless it falls off the root of the graph.
//
// unfortunately the subgraph recurring algorithm sometimes goes too far
// back in history -- for example if there is an unambiguous propagate from
// one branch to another, the entire subgraph preceding the propagate on
// the recipient branch is elided, since it is a merge.
//
// our current hypothesis is that the *exact* condition we're looking for,
// when doing a merge, is the least node which dominates one side of the
// merge and is an ancestor of the other.
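//
// a small illustrative sketch (not drawn from any particular test case):
// in a graph like
//
//        A
//       / \
//      B   C
//      |   |
//      D   E
//       \ / \
//        M   F        (M is a merge of D and E)
//        |
//        G
//
// a merge of G and F should use E as its base: E dominates F (every path
// from the root A down to F passes through E) and it is an ancestor of G
// (through the merge M). the other common ancestors, A and C, also
// dominate F but lie further back in history, so E is the least node
// satisfying the condition above.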

typedef unsigned long ctx;
typedef boost::dynamic_bitset<> bitmap;
typedef boost::shared_ptr<bitmap> shared_bitmap;
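// a ctx is the small integer handle an interner assigns to a revision_id;
// the bitmaps below are indexed by these handles, so "bit n is set" means
// "the node with handle n is in this set".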

static void
ensure_parents_loaded(ctx child,
                      std::map<ctx, shared_bitmap> & parents,
                      interner<ctx> & intern,
                      app_state & app)
{
  if (parents.find(child) != parents.end())
    return;

  L(F("loading parents for node %d\n") % child);

  std::set<revision_id> imm_parents;
  app.db.get_revision_parents(revision_id(intern.lookup(child)), imm_parents);

  // The null revision is not a parent for purposes of finding common
  // ancestors. note that erasing through a std::set iterator invalidates
  // it, so we advance past each element before erasing it.
  for (std::set<revision_id>::iterator p = imm_parents.begin();
       p != imm_parents.end(); )
    {
      if (null_id(*p))
        imm_parents.erase(p++);
      else
        ++p;
    }

  shared_bitmap bits = shared_bitmap(new bitmap(parents.size()));

  for (std::set<revision_id>::const_iterator p = imm_parents.begin();
       p != imm_parents.end(); ++p)
    {
      ctx pn = intern.intern(p->inner()());
      L(F("parent %s -> node %d\n") % *p % pn);
      if (pn >= bits->size())
        bits->resize(pn+1);
      bits->set(pn);
    }

  parents.insert(std::make_pair(child, bits));
}

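// expand_dominators grows each node's dominator set one step at a time:
// dom(n) accumulates {n} plus the intersection of its parents' current
// dominator sets, and the caller repeats the pass until nothing changes.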
static bool
expand_dominators(std::map<ctx, shared_bitmap> & parents,
                  std::map<ctx, shared_bitmap> & dominators,
                  interner<ctx> & intern,
                  app_state & app)
{
  bool something_changed = false;
  std::vector<ctx> nodes;

  nodes.reserve(dominators.size());

  // pass 1, pull out all the node numbers we're going to scan this time around
  for (std::map<ctx, shared_bitmap>::const_iterator e = dominators.begin();
       e != dominators.end(); ++e)
    nodes.push_back(e->first);

  // pass 2, update any of the dominator entries we can
  for (std::vector<ctx>::const_iterator n = nodes.begin();
       n != nodes.end(); ++n)
    {
      shared_bitmap bits = dominators[*n];
      bitmap saved(*bits);
      if (bits->size() <= *n)
        bits->resize(*n + 1);
      bits->set(*n);

      ensure_parents_loaded(*n, parents, intern, app);
      shared_bitmap n_parents = parents[*n];

      bitmap intersection(bits->size());

      bool first = true;
      for (ctx parent = 0;
           parent != n_parents->size(); ++parent)
        {
          if (! n_parents->test(parent))
            continue;

          if (dominators.find(parent) == dominators.end())
            dominators.insert(std::make_pair(parent,
                                             shared_bitmap(new bitmap())));
          shared_bitmap pbits = dominators[parent];

          if (bits->size() > pbits->size())
            pbits->resize(bits->size());

          if (pbits->size() > bits->size())
            bits->resize(pbits->size());

          // dynamic_bitset's &= and |= require operands of equal size, so
          // keep the running intersection as large as the (possibly just
          // grown) per-node bitmaps.
          if (intersection.size() < bits->size())
            intersection.resize(bits->size());

          if (first)
            {
              intersection = (*pbits);
              first = false;
            }
          else
            intersection &= (*pbits);
        }

      if (intersection.size() < bits->size())
        intersection.resize(bits->size());

      (*bits) |= intersection;
      if (*bits != saved)
        something_changed = true;
    }
  return something_changed;
}


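// the ancestor sets are grown the same way, but with a union instead of an
// intersection: ancestors(n) accumulates {n}, each parent p, and everything
// already in ancestors(p), again iterated until nothing changes.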
static bool
expand_ancestors(std::map<ctx, shared_bitmap> & parents,
                 std::map<ctx, shared_bitmap> & ancestors,
                 interner<ctx> & intern,
                 app_state & app)
{
  bool something_changed = false;
  std::vector<ctx> nodes;

  nodes.reserve(ancestors.size());

  // pass 1, pull out all the node numbers we're going to scan this time around
  for (std::map<ctx, shared_bitmap>::const_iterator e = ancestors.begin();
       e != ancestors.end(); ++e)
    nodes.push_back(e->first);

  // pass 2, update any of the ancestor entries we can
  for (std::vector<ctx>::const_iterator n = nodes.begin(); n != nodes.end(); ++n)
    {
      shared_bitmap bits = ancestors[*n];
      bitmap saved(*bits);
      if (bits->size() <= *n)
        bits->resize(*n + 1);
      bits->set(*n);

      ensure_parents_loaded(*n, parents, intern, app);
      shared_bitmap n_parents = parents[*n];
      for (ctx parent = 0; parent != n_parents->size(); ++parent)
        {
          if (! n_parents->test(parent))
            continue;

          if (bits->size() <= parent)
            bits->resize(parent + 1);
          bits->set(parent);

          if (ancestors.find(parent) == ancestors.end())
            ancestors.insert(std::make_pair(parent,
                                            shared_bitmap(new bitmap())));
          shared_bitmap pbits = ancestors[parent];

          if (bits->size() > pbits->size())
            pbits->resize(bits->size());

          if (pbits->size() > bits->size())
            bits->resize(pbits->size());

          (*bits) |= (*pbits);
        }
      if (*bits != saved)
        something_changed = true;
    }
  return something_changed;
}

static bool
find_intersecting_node(bitmap & fst,
                       bitmap & snd,
                       interner<ctx> const & intern,
                       revision_id & anc)
{
  if (fst.size() > snd.size())
    snd.resize(fst.size());
  else if (snd.size() > fst.size())
    fst.resize(snd.size());

  bitmap intersection = fst & snd;
  if (intersection.any())
    {
      L(F("found %d intersecting nodes\n") % intersection.count());
      for (ctx i = 0; i < intersection.size(); ++i)
        {
          if (intersection.test(i))
            {
              anc = revision_id(intern.lookup(i));
              return true;
            }
        }
    }
  return false;
}

//  static void
//  dump_bitset_map(std::string const & hdr,
//              std::map< ctx, shared_bitmap > const & mm)
//  {
//    L(F("dumping [%s] (%d entries)\n") % hdr % mm.size());
//    for (std::map< ctx, shared_bitmap >::const_iterator i = mm.begin();
//         i != mm.end(); ++i)
//      {
//        L(F("dump [%s]: %d -> %s\n") % hdr % i->first % (*(i->second)));
//      }
//  }

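// find_common_ancestor_for_merge grows the ancestor and dominator sets of
// both sides a pass at a time, checking after each pass whether some node
// is an ancestor of one side and a dominator of the other (the condition
// argued for above); it returns false once neither set can be expanded
// any further without finding such a node.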
bool
find_common_ancestor_for_merge(revision_id const & left,
                               revision_id const & right,
                               revision_id & anc,
                               app_state & app)
{
  interner<ctx> intern;
  std::map< ctx, shared_bitmap >
    parents, ancestors, dominators;

  ctx ln = intern.intern(left.inner()());
  ctx rn = intern.intern(right.inner()());

  shared_bitmap lanc = shared_bitmap(new bitmap());
  shared_bitmap ranc = shared_bitmap(new bitmap());
  shared_bitmap ldom = shared_bitmap(new bitmap());
  shared_bitmap rdom = shared_bitmap(new bitmap());

  ancestors.insert(std::make_pair(ln, lanc));
  ancestors.insert(std::make_pair(rn, ranc));
  dominators.insert(std::make_pair(ln, ldom));
  dominators.insert(std::make_pair(rn, rdom));

  L(F("searching for common ancestor, left=%s right=%s\n") % left % right);

  while (expand_ancestors(parents, ancestors, intern, app) ||
         expand_dominators(parents, dominators, intern, app))
    {
      L(F("common ancestor scan [par=%d,anc=%d,dom=%d]\n") %
        parents.size() % ancestors.size() % dominators.size());

      if (find_intersecting_node(*lanc, *rdom, intern, anc))
        {
          L(F("found node %s, ancestor of left %s and dominating right %s\n")
            % anc % left % right);
          return true;
        }

      else if (find_intersecting_node(*ranc, *ldom, intern, anc))
        {
          L(F("found node %s, ancestor of right %s and dominating left %s\n")
            % anc % right % left);
          return true;
        }
    }
//      dump_bitset_map("ancestors", ancestors);
//      dump_bitset_map("dominators", dominators);
//      dump_bitset_map("parents", parents);
  return false;
}


bool
find_least_common_ancestor(revision_id const & left,
                           revision_id const & right,
                           revision_id & anc,
                           app_state & app)
{
  interner<ctx> intern;
  std::map< ctx, shared_bitmap >
    parents, ancestors, dominators;

  ctx ln = intern.intern(left.inner()());
  ctx rn = intern.intern(right.inner()());

  shared_bitmap lanc = shared_bitmap(new bitmap());
  shared_bitmap ranc = shared_bitmap(new bitmap());

  ancestors.insert(std::make_pair(ln, lanc));
  ancestors.insert(std::make_pair(rn, ranc));

  L(F("searching for least common ancestor, left=%s right=%s\n") % left % right);

  while (expand_ancestors(parents, ancestors, intern, app))
    {
      L(F("least common ancestor scan [par=%d,anc=%d]\n") %
        parents.size() % ancestors.size());

      if (find_intersecting_node(*lanc, *ranc, intern, anc))
        {
          L(F("found node %s, ancestor of left %s and right %s\n")
            % anc % left % right);
          return true;
        }
    }
//      dump_bitset_map("ancestors", ancestors);
//      dump_bitset_map("parents", parents);
  return false;
}


//
// The idea with this algorithm is to walk from child up to ancestor,
// recursively, accumulating all the change_sets associated with
// intermediate nodes into *one big change_set*.
//
// clever readers will realize this is an overlapping-subproblem type
// situation and thus needs a dynamic programming map to keep itself at
// linear complexity.
//
// in fact, we keep two: one which maps to computed results (partial_csets)
// and one which just keeps a set of all nodes we traversed
// (visited_nodes). in theory it could be one map with an extra bool stuck
// on each entry, but I think that would make it even less readable. it's
// already quite ugly.
//
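// for example, when two different edges in the walk lead to the same
// parent: if that parent was already found relevant and its composed
// change_set stored in partial_csets, the second edge reuses the stored
// result; if it was explored and found not to reach the ancestor, it sits
// in visited_nodes and the second edge skips it immediately.
//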

static bool
calculate_change_sets_recursive(revision_id const & ancestor,
                                revision_id const & child,
                                app_state & app,
                                change_set & cumulative_cset,
                                std::map<revision_id, boost::shared_ptr<change_set> > & partial_csets,
                                std::set<revision_id> & visited_nodes)
{
  if (ancestor == child)
    return true;

  visited_nodes.insert(child);

  bool relevant_child = false;

  revision_set rev;
  app.db.get_revision(child, rev);

  L(F("exploring changesets from parents of %s, seeking towards %s\n")
    % child % ancestor);

  for (edge_map::const_iterator i = rev.edges.begin(); i != rev.edges.end(); ++i)
    {
      bool relevant_parent = false;
      revision_id curr_parent = edge_old_revision(i);

      if (curr_parent.inner()().empty())
        continue;

      change_set cset_to_curr_parent;

      L(F("considering parent %s of %s\n") % curr_parent % child);

      std::map<revision_id, boost::shared_ptr<change_set> >::const_iterator j =
        partial_csets.find(curr_parent);
      if (j != partial_csets.end())
        {
          // a recursive call has traversed this parent before and built an
          // existing cset. just reuse that rather than re-traversing
          cset_to_curr_parent = *(j->second);
          relevant_parent = true;
        }
      else if (visited_nodes.find(curr_parent) != visited_nodes.end())
        {
          // a recursive call has traversed this parent, but there was no
          // path from it to the ancestor, so the parent is irrelevant. skip.
          relevant_parent = false;
        }
      else
        relevant_parent = calculate_change_sets_recursive(ancestor, curr_parent, app,
                                                          cset_to_curr_parent,
                                                          partial_csets,
                                                          visited_nodes);

      if (relevant_parent)
        {
          L(F("revision %s is relevant, composing with edge to %s\n")
            % curr_parent % child);
          concatenate_change_sets(cset_to_curr_parent, edge_changes(i), cumulative_cset);
          relevant_child = true;
          break;
        }
      else
        L(F("parent %s of %s is not relevant\n") % curr_parent % child);
    }

  // store the composed change_set from ancestor -> child, so that if anyone
  // re-traverses this edge they'll just fetch from the partial_csets cache.
  if (relevant_child)
    partial_csets.insert(std::make_pair(child,
                                        boost::shared_ptr<change_set>
                                        (new change_set(cumulative_cset))));

  return relevant_child;
}

void
calculate_composite_change_set(revision_id const & ancestor,
                               revision_id const & child,
                               app_state & app,
                               change_set & composed)
{
  L(F("calculating composite changeset between %s and %s\n")
    % ancestor % child);
  std::set<revision_id> visited;
  std::map<revision_id, boost::shared_ptr<change_set> > partial;
  calculate_change_sets_recursive(ancestor, child, app, composed, partial, visited);
}

// migration stuff
//
// FIXME: these are temporary functions; once we've done the migration to
// revisions / changesets, we can remove them.
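//
// roughly: build_changesets reads the old manifest "ancestor" certs to
// recover the manifest-level ancestry graph, then calls construct_revisions
// on each head. construct_revisions walks back through that graph, building
// parents before children, and synthesizes one revision per manifest, with
// change_sets produced by analyze_manifest_changes (files present only in
// the parent become deletes, entries whose ids differ become deltas, files
// present only in the child become adds); finally the interesting manifest
// certs are hoisted up onto the new revisions.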

static void
analyze_manifest_changes(app_state & app,
                         manifest_id const & parent,
                         manifest_id const & child,
                         change_set & cs)
{
  manifest_map m_parent, m_child;
  app.db.get_manifest(parent, m_parent);
  app.db.get_manifest(child, m_child);

  for (manifest_map::const_iterator i = m_parent.begin();
       i != m_parent.end(); ++i)
    {
      manifest_map::const_iterator j = m_child.find(manifest_entry_path(i));
      if (j == m_child.end())
        cs.delete_file(manifest_entry_path(i));
      else if (! (manifest_entry_id(i) == manifest_entry_id(j)))
        {
          cs.apply_delta(manifest_entry_path(i),
                         manifest_entry_id(i),
                         manifest_entry_id(j));
        }
    }
  for (manifest_map::const_iterator i = m_child.begin();
       i != m_child.end(); ++i)
    {
      manifest_map::const_iterator j = m_parent.find(manifest_entry_path(i));
      if (j == m_parent.end())
        cs.add_file(manifest_entry_path(i),
                    manifest_entry_id(i));
    }
}

static revision_id
construct_revisions(app_state & app,
                    manifest_id const & child,
                    std::multimap< manifest_id, manifest_id > const & ancestry,
                    std::map<manifest_id, revision_id> & mapped)
{
  revision_set rev;
  typedef std::multimap< manifest_id, manifest_id >::const_iterator ci;
  std::pair<ci,ci> range = ancestry.equal_range(child);
  for (ci i = range.first; i != range.second; ++i)
    {
      manifest_id parent(i->second);
      revision_id parent_rid;
      std::map<manifest_id, revision_id>::const_iterator j = mapped.find(parent);

      if (j != mapped.end())
        parent_rid = j->second;
      else
        {
          parent_rid = construct_revisions(app, parent, ancestry, mapped);
          P(F("inserting mapping %d : %s -> %s\n") % mapped.size() % parent % parent_rid);
          mapped.insert(std::make_pair(parent, parent_rid));
        }

      change_set cs;
      analyze_manifest_changes(app, parent, child, cs);
      rev.edges.insert(std::make_pair(parent_rid,
                                      std::make_pair(parent, cs)));
    }

  revision_id rid;
  if (rev.edges.empty())
    {
      P(F("ignoring empty revision for manifest %s\n") % child);
      return rid;
    }

  rev.new_manifest = child;
  calculate_ident(rev, rid);

  if (!app.db.revision_exists(rid))
    {
      P(F("mapping manifest %s to revision %s\n") % child % rid);
      app.db.put_revision(rid, rev);
    }
  else
    {
      P(F("skipping additional path to revision %s\n") % rid);
    }

  // now hoist all the interesting certs up to the revision
  std::set<cert_name> cnames;
  cnames.insert(cert_name(branch_cert_name));
  cnames.insert(cert_name(date_cert_name));
  cnames.insert(cert_name(author_cert_name));
  cnames.insert(cert_name(tag_cert_name));
  cnames.insert(cert_name(changelog_cert_name));
  cnames.insert(cert_name(comment_cert_name));
  cnames.insert(cert_name(testresult_cert_name));

  std::vector< manifest<cert> > tmp;
  app.db.get_manifest_certs(child, tmp);
  erase_bogus_certs(tmp, app);
  for (std::vector< manifest<cert> >::const_iterator i = tmp.begin();
       i != tmp.end(); ++i)
    {
      if (cnames.find(i->inner().name) == cnames.end())
        continue;
      cert new_cert;
      cert_value tv;
      decode_base64(i->inner().value, tv);
      make_simple_cert(rid.inner(), i->inner().name, tv, app, new_cert);
      if (! app.db.revision_cert_exists(revision<cert>(new_cert)))
        app.db.put_revision_cert(revision<cert>(new_cert));
    }
  return rid;
}


void
build_changesets(app_state & app)
{
  std::vector< manifest<cert> > tmp;
  app.db.get_manifest_certs(cert_name("ancestor"), tmp);
  erase_bogus_certs(tmp, app);

  std::multimap< manifest_id, manifest_id > ancestry;
  std::set<manifest_id> heads;
  std::set<manifest_id> total;

  for (std::vector< manifest<cert> >::const_iterator i = tmp.begin();
       i != tmp.end(); ++i)
    {
      cert_value tv;
      decode_base64(i->inner().value, tv);
      manifest_id child, parent;
      child = i->inner().ident;
      parent = hexenc<id>(tv());
      heads.insert(child);
      heads.erase(parent);
      total.insert(child);
      total.insert(parent);
      ancestry.insert(std::make_pair(child, parent));
    }

  P(F("found a total of %d manifests\n") % total.size());

  transaction_guard guard(app.db);
  std::map<manifest_id, revision_id> mapped;
  for (std::set<manifest_id>::const_iterator i = heads.begin();
       i != heads.end(); ++i)
    {
      construct_revisions(app, *i, ancestry, mapped);
    }
  guard.commit();
}


// i/o stuff

std::string revision_file_name("revision");

namespace
{
  namespace syms
  {
    std::string const old_revision("old_revision");
    std::string const new_manifest("new_manifest");
    std::string const old_manifest("old_manifest");
  }
}


void
print_edge(basic_io::printer & printer,
           edge_entry const & e)
{
  basic_io::stanza st;
  st.push_hex_pair(syms::old_revision, edge_old_revision(e).inner()());
  st.push_hex_pair(syms::old_manifest, edge_old_manifest(e).inner()());
  printer.print_stanza(st);
  print_change_set(printer, edge_changes(e));
}


void
print_revision(basic_io::printer & printer,
               revision_set const & rev)
{
  basic_io::stanza st;
  st.push_hex_pair(syms::new_manifest, rev.new_manifest.inner()());
  printer.print_stanza(st);
  for (edge_map::const_iterator edge = rev.edges.begin();
       edge != rev.edges.end(); ++edge)
    print_edge(printer, *edge);
}
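
// so the printed form of a revision is, roughly, a new_manifest stanza
// followed by one old_revision / old_manifest stanza (plus its change_set)
// per edge; something along these lines, with real 40-character hex ids:
//
//   new_manifest [f2e5e6...]
//
//   old_revision [a1b2c3...]
//   old_manifest [d4e5f6...]
//   ... change_set contents ...
//
// the exact quoting and layout are whatever basic_io::printer produces;
// parse_revision below accepts the same shape back.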


void
parse_edge(basic_io::parser & parser,
           edge_map & es)
{
  change_set cs;
  manifest_id old_man;
  revision_id old_rev;
  std::string tmp;

  parser.esym(syms::old_revision);
  parser.hex(tmp);
  old_rev = revision_id(tmp);

  parser.esym(syms::old_manifest);
  parser.hex(tmp);
  old_man = manifest_id(tmp);

  parse_change_set(parser, cs);

  es.insert(std::make_pair(old_rev, std::make_pair(old_man, cs)));
}


void
parse_revision(basic_io::parser & parser,
               revision_set & rev)
{
  rev.edges.clear();
  std::string tmp;
  parser.esym(syms::new_manifest);
  parser.hex(tmp);
  rev.new_manifest = manifest_id(tmp);
  while (parser.symp(syms::old_revision))
    parse_edge(parser, rev.edges);
}

void
read_revision_set(data const & dat,
                  revision_set & rev)
{
  std::istringstream iss(dat());
  basic_io::input_source src(iss, "revision");
  basic_io::tokenizer tok(src);
  basic_io::parser pars(tok);
  parse_revision(pars, rev);
}
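
// the revision_data overloads below work on the packed form of a revision
// (the basic_io text gzipped and then base64-encoded); the plain data
// overloads carry the readable text itself.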

void
read_revision_set(revision_data const & dat,
                  revision_set & rev)
{
  data unpacked;
  unpack(dat.inner(), unpacked);
  read_revision_set(unpacked, rev);
}

void
write_revision_set(revision_set const & rev,
                   data & dat)
{
  std::ostringstream oss;
  basic_io::printer pr(oss);
  print_revision(pr, rev);
  dat = data(oss.str());
}

void
write_revision_set(revision_set const & rev,
                   revision_data & dat)
{
  data d;
  write_revision_set(rev, d);
  base64< gzip<data> > packed;
  pack(d, packed);
  dat = revision_data(packed);
}

#ifdef BUILD_UNIT_TESTS
#include "unit_tests.hh"
#include "sanity.hh"

static void
revision_test()
{
}

void
add_revision_tests(test_suite * suite)
{
  I(suite);
  suite->add(BOOST_TEST_CASE(&revision_test));
}


#endif // BUILD_UNIT_TESTS