1# frozen_string_literal: true
2
3# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184)
4# is simple in that it recursively calls `as_json` on each object to
5# serialize everything. However, for a model like a Project, this can
6# generate a query for every single association, which can add up to tens
7# of thousands of queries and lead to memory bloat.
8#
9# To improve this, we can do several things:
10
11# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html
12#    to generate the necessary preload clauses.
13#
14# 2. We observe that a single project has many issues, merge requests,
15#    etc. Instead of serializing everything at once, which could lead to
16#    database timeouts and high memory usage, we take each top-level
17#    association and serialize the data in batches.
18#
19#  For example, we serialize the first 100 issues and preload all of
20#  their associated events, notes, etc. before moving onto the next
21#  batch. When we're done, we serialize merge requests in the same way.
22#  We repeat this pattern for the remaining associations specified in
23#  import_export.yml.
24module Gitlab
25  module ImportExport
26    class FastHashSerializer
27      attr_reader :subject, :tree
28
      # Usage of this class results in delayed
      # serialization of the relation. The serialization
      # is triggered when `JSON.generate`
      # is executed.
33      #
34      # This class uses memory-optimised, lazily
35      # initialised, fast to recycle relation
36      # serialization.
37      #
      # `JSON.generate` calls `#to_json`, which
      # returns raw JSON content that is written
      # directly to the file.
41      class JSONBatchRelation
42        include Gitlab::Utils::StrongMemoize
43
# Stores everything needed to serialize one batch later on.
#
# relation - collection of records in this batch; it is iterated with
#            `each` (and sent `preload` when preloads are given)
# options  - handed unchanged to every record's `to_json`
# preloads - argument for `relation.preload`; nil/false skips preloading
def initialize(relation, options, preloads)
  @relation, @options, @preloads = relation, options, preloads
end
49
# Builds (once) the JSON fragment for this batch: every record's
# `to_json` output joined by commas, with no surrounding brackets.
# An empty batch yields an empty string. The result is memoized via
# `strong_memoize`.
def raw_json
  strong_memoize(:raw_json) do
    # Preloading is optional; without a spec the relation is used as is.
    records = @preloads ? @relation.preload(@preloads) : @relation
    buffer = +''

    records.each do |record|
      buffer << ',' unless buffer.empty?
      buffer << record.to_json(@options)
    end

    buffer
  end
end
64
# JSON generation hook: returns the pre-built fragment from `raw_json`.
#
# options - accepted so callers may pass generator options; it is not
#           used, the fragment is returned as is
def to_json(options = {})
  raw_json
end
68
# A batch only exists as a raw JSON string, so there is no Hash/Array
# representation to hand out.
#
# Raises NotImplementedError for any arguments; the message points the
# caller at `#to_json`.
def as_json(*)
  raise NotImplementedError, "#{self.class.name} can only be serialized through #to_json"
end
72      end
73
74      BATCH_SIZE = 100
75
# subject    - the object to serialize; it is sent `as_json` and the
#              relation names listed under the tree's `:include` key
# tree       - options Hash; this class reads its `:include` and
#              `:preload` keys and forwards the rest to `as_json`
# batch_size - number of records per batch for has-many relations
#              (defaults to BATCH_SIZE)
def initialize(subject, tree, batch_size: BATCH_SIZE)
  @subject = subject
  @tree = tree
  @batch_size = batch_size
end
81
82      # With the usage of `JSONBatchRelation`, it returns partially
83      # serialized hash which is not easily accessible.
84      # It means you can only manipulate and replace top-level objects.
85      # All future mutations of the hash (such as `fix_project_tree`)
86      # should be aware of that.
def execute
  # Own attributes first, then the included relations; on a key clash
  # the relation entry replaces the attribute entry.
  attributes = simple_serialize
  relations = serialize_includes

  attributes.merge(relations)
end
90
91      private
92
# Serializes the subject itself through `as_json`, passing the tree with
# its relation-level entries blanked out: relations are serialized
# separately by `serialize_includes`.
#
# The preload entry is blanked under `:preload` (singular) because that
# is the key this class reads the preload spec from.
def simple_serialize
  subject.as_json(tree.merge(include: nil, preload: nil))
end
97
# Serializes every include definition of the tree and collects the
# results into a Hash keyed by relation name. Definitions that yield
# nothing (nil) are left out; without any includes the result is `{}`.
def serialize_includes
  definitions = includes
  return {} unless definitions

  definitions.each_with_object({}) do |definition, serialized|
    entry = serialize_include_definition(definition)
    next if entry.nil?

    serialized[entry.first] = entry.last
  end
end
106
107      # definition:
108      # { labels: { includes: ... } }
def serialize_include_definition(definition)
  unless definition.is_a?(Hash)
    raise ArgumentError, 'definition needs to be Hash'
  end

  unless definition.one?
    raise ArgumentError, 'definition needs to have exactly one Hash element'
  end

  # The single entry maps the relation name to its serialization options.
  key, options = definition.first

  record = subject.public_send(key) # rubocop: disable GitlabSecurity/PublicSend
  serialized_record = record && serialize_record(key, record, options)

  # Nothing to report when the relation is absent or serializes to nothing.
  # `#as_json` always returns keys as `strings`
  [key.to_s, serialized_record] if serialized_record
end
125
# Serializes the value of one included relation.
#
# key     - relation name; used for the error message and to look up the
#           relation's preload spec
# record  - the relation's value: a single object or an
#           ActiveRecord::Relation
# options - serialization options for the relation's records
#
# Returns `record.as_json(options)` for anything that is not an
# ActiveRecord::Relation, otherwise an Array with one JSONBatchRelation
# per batch of `@batch_size` records.
# Raises RuntimeError when record does not respond to `as_json`.
def serialize_record(key, record, options)
  raise "Invalid type of #{key} is #{record.class}" unless record.respond_to?(:as_json)

  # no has-many relation
  return record.as_json(options) unless record.is_a?(ActiveRecord::Relation)

  # The tree may carry no preload spec at all; that simply means there is
  # nothing to preload for this relation. The lookup does not depend on
  # the batch, so it is done once.
  preload_spec = preloads
  relation_preloads = preload_spec && preload_spec[key]

  data = []

  record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches
    # order each batch by its primary key to ensure
    # consistent and predictable ordering of each exported relation
    # as additional `WHERE` clauses can impact the order in which data is being
    # returned by database when no `ORDER` is specified
    ordered_batch = batch.reorder(batch.klass.primary_key)

    # `raw_json` is called right away so the batch is serialized here.
    data << JSONBatchRelation.new(ordered_batch, options, relation_preloads).tap(&:raw_json)
  end

  data
end
150
# The include definitions of the tree, i.e. whatever is stored under its
# `:include` key (nil when that key is absent from a plain Hash).
def includes
  tree[:include]
end
154
# The preload spec of the tree, i.e. whatever is stored under its
# `:preload` key (nil when that key is absent from a plain Hash).
def preloads
  tree[:preload]
end
158    end
159  end
160end
161