1# frozen_string_literal: true
2
3module Gitlab
4  module Git
5    class Blob
6      include Gitlab::BlobHelper
7      include Gitlab::EncodingHelper
8      extend Gitlab::Git::WrapsGitalyErrors
9
10      # This number is the maximum amount of data that we want to display to
11      # the user. We load as much as we can for encoding detection and LFS
12      # pointer parsing. All other cases where we need full blob data should
13      # use load_all_data!.
14      MAX_DATA_DISPLAY_SIZE = 10.megabytes
15
16      # The number of blobs loaded in a single Gitaly call
17      # When a large number of blobs requested, we'd want to fetch them in
18      # multiple Gitaly calls
19      BATCH_SIZE = 250
20
21      # These limits are used as a heuristic to ignore files which can't be LFS
22      # pointers. The format of these is described in
23      # https://github.com/git-lfs/git-lfs/blob/master/docs/spec.md#the-pointer
24      LFS_POINTER_MIN_SIZE = 120.bytes
25      LFS_POINTER_MAX_SIZE = 200.bytes
26
27      attr_accessor :size, :mode, :id, :commit_id, :loaded_size, :binary
28      attr_writer :name, :path, :data
29
30      def self.gitlab_blob_truncated_true
31        @gitlab_blob_truncated_true ||= ::Gitlab::Metrics.counter(:gitlab_blob_truncated_true, 'blob.truncated? == true')
32      end
33
34      def self.gitlab_blob_truncated_false
35        @gitlab_blob_truncated_false ||= ::Gitlab::Metrics.counter(:gitlab_blob_truncated_false, 'blob.truncated? == false')
36      end
37
38      def self.gitlab_blob_size
39        @gitlab_blob_size ||= ::Gitlab::Metrics.histogram(
40          :gitlab_blob_size,
41          'Gitlab::Git::Blob size',
42          {},
43          [1_000, 5_000, 10_000, 50_000, 100_000, 500_000, 1_000_000]
44        )
45      end
46
      class << self
        # Fetches the blob at `path` for commit `sha`, loading at most `limit`
        # bytes of data. Returns nil when the path does not resolve.
        def find(repository, sha, path, limit: MAX_DATA_DISPLAY_SIZE)
          tree_entry(repository, sha, path, limit)
        end

        # Resolves `path` at `sha` via Gitaly's TreeEntry RPC and wraps the
        # result in a Blob. Submodule (:COMMIT) entries produce a Blob with
        # empty data; entry types other than :COMMIT/:BLOB produce nil.
        def tree_entry(repository, sha, path, limit)
          return unless path

          # Normalize to a repository-relative path; an all-slash input
          # collapses to '/' rather than the empty string.
          path = path.sub(%r{\A/*}, '')
          path = '/' if path.empty?
          name = File.basename(path)

          # Gitaly will think that setting the limit to 0 means unlimited, while
          # the client might only need the metadata and thus set the limit to 0.
          # In this method we'll then set the limit to 1, but clear the byte of data
          # that we got back so for the outside world it looks like the limit was
          # actually 0.
          req_limit = limit == 0 ? 1 : limit

          entry = Gitlab::GitalyClient::CommitService.new(repository).tree_entry(sha, path, req_limit)
          return unless entry

          entry.data = "" if limit == 0

          case entry.type
          when :COMMIT
            new(id: entry.oid, name: name, size: 0, data: '', path: path, commit_id: sha)
          when :BLOB
            # `mode` arrives as an integer; callers expect the octal string form.
            new(id: entry.oid, name: name, size: entry.size, data: entry.data.dup, mode: entry.mode.to_s(8),
                path: path, commit_id: sha, binary: binary?(entry.data))
          end
        end

        # Reads a blob by object id (`sha`), loading at most `limit` bytes.
        def raw(repository, sha, limit: MAX_DATA_DISPLAY_SIZE)
          repository.gitaly_blob_client.get_blob(oid: sha, limit: limit)
        end

        # Returns an array of Blob instances, specified in blob_references as
        # [[commit_sha, path], [commit_sha, path], ...]. If blob_size_limit < 0 then the
        # full blob contents are returned. If blob_size_limit >= 0 then each blob will
        # contain no more than limit bytes in its data attribute.
        #
        # Keep in mind that this method may allocate a lot of memory. It is up
        # to the caller to limit the number of blobs and blob_size_limit.
        #
        def batch(repository, blob_references, blob_size_limit: MAX_DATA_DISPLAY_SIZE)
          # Fetch in slices of BATCH_SIZE so each Gitaly call stays bounded.
          blob_references.each_slice(BATCH_SIZE).flat_map do |refs|
            repository.gitaly_blob_client.get_blobs(refs, blob_size_limit).to_a
          end
        end

        # Returns an array of Blob instances just with the metadata, that means
        # the data attribute has no content.
        def batch_metadata(repository, blob_references)
          batch(repository, blob_references, blob_size_limit: 0)
        end

        # Find LFS blobs given an array of sha ids
        # Returns array of Gitlab::Git::Blob
        # Does not guarantee blob data will be set
        def batch_lfs_pointers(repository, blob_ids)
          wrapped_gitaly_errors do
            repository.gitaly_blob_client.batch_lfs_pointers(blob_ids.to_a)
          end
        end

        # Delegates binary detection of `data` to the shared encoding helper;
        # `cache_key` lets the helper memoize the verdict per blob.
        def binary?(data, cache_key: nil)
          EncodingHelper.detect_libgit2_binary?(data, cache_key: cache_key)
        end

        # Cheap pre-filter: only blobs whose size falls within the LFS pointer
        # bounds (LFS_POINTER_MIN_SIZE..LFS_POINTER_MAX_SIZE) can be pointers.
        def size_could_be_lfs?(size)
          size.between?(LFS_POINTER_MIN_SIZE, LFS_POINTER_MAX_SIZE)
        end
      end
121
122      def initialize(options)
123        %w(id name path size data mode commit_id binary).each do |key|
124          self.__send__("#{key}=", options[key.to_sym]) # rubocop:disable GitlabSecurity/PublicSend
125        end
126
127        # Retain the actual size before it is encoded
128        @loaded_size = @data.bytesize if @data
129        @loaded_all_data = @loaded_size == size
130
131        record_metric_blob_size
132        record_metric_truncated(truncated?)
133      end
134
135      def binary_in_repo?
136        @binary.nil? ? super : @binary == true
137      end
138
139      def data
140        encode! @data
141      end
142
143      # Load all blob data (not just the first MAX_DATA_DISPLAY_SIZE bytes) into
144      # memory as a Ruby string.
145      def load_all_data!(repository)
146        return if @data == '' # don't mess with submodule blobs
147
148        # Even if we return early, recalculate whether this blob is binary in
149        # case a blob was initialized as text but the full data isn't
150        @binary = nil
151
152        return if @loaded_all_data
153
154        @data = repository.gitaly_blob_client.get_blob(oid: id, limit: -1).data
155        @loaded_all_data = true
156        @loaded_size = @data.bytesize
157      end
158
159      def name
160        encode! @name
161      end
162
163      def path
164        encode! @path
165      end
166
167      def truncated?
168        return false unless size && loaded_size
169
170        size > loaded_size
171      end
172
173      # Valid LFS object pointer is a text file consisting of
174      # version
175      # oid
176      # size
177      # see https://github.com/github/git-lfs/blob/v1.1.0/docs/spec.md#the-pointer
178      def lfs_pointer?
179        self.class.size_could_be_lfs?(size) && has_lfs_version_key? && lfs_oid.present? && lfs_size.present?
180      end
181
182      def lfs_oid
183        if has_lfs_version_key?
184          oid = data.match(/(?<=sha256:)([0-9a-f]{64})/)
185          return oid[1] if oid
186        end
187
188        nil
189      end
190
191      def lfs_size
192        if has_lfs_version_key?
193          size = data.match(/(?<=size )([0-9]+)/)
194          return size[1].to_i if size
195        end
196
197        nil
198      end
199
200      def external_storage
201        return unless lfs_pointer?
202
203        :lfs
204      end
205
      # Expose the LFS pointer's declared size under a storage-agnostic name,
      # mirroring #external_storage.
      alias_method :external_size, :lfs_size
207
208      private
209
210      def record_metric_blob_size
211        return unless size
212
213        self.class.gitlab_blob_size.observe({}, size)
214      end
215
216      def record_metric_truncated(bool)
217        if bool
218          self.class.gitlab_blob_truncated_true.increment
219        else
220          self.class.gitlab_blob_truncated_false.increment
221        end
222      end
223
224      def has_lfs_version_key?
225        !empty? && text_in_repo? && data.start_with?("version https://git-lfs.github.com/spec")
226      end
227    end
228  end
229end
230
# Prepend the Rugged (libgit2) implementation's class methods onto the
# singleton class so they are looked up before the Gitaly-backed versions
# defined above. NOTE(review): presumably the prepended module falls through
# to these methods unless Rugged is enabled — confirm in RuggedImpl::Blob.
Gitlab::Git::Blob.singleton_class.prepend Gitlab::Git::RuggedImpl::Blob::ClassMethods
232