1# frozen_string_literal: true
2
3module Gitlab
4  module LegacyGithubImport
5    class Importer
      # Returns whatever Gitlab::GithubImport.refmap returns, so this legacy
      # importer exposes the same refmap as that module.
      def self.refmap
        Gitlab::GithubImport.refmap
      end
9
      # Read-only accessors for state set up in #initialize:
      #   errors   - Array that the import steps append failure hashes to
      #   project  - the project passed to the constructor
      #   repo     - project.import_source
      #   repo_url - project.import_url
      attr_reader :errors, :project, :repo, :repo_url
11
12      def initialize(project)
13        @project  = project
14        @repo     = project.import_source
15        @repo_url = project.import_url
16        @errors   = []
17        @labels   = {}
18      end
19
20      def client
21        return @client if defined?(@client)
22
23        unless credentials
24          raise Projects::ImportService::Error,
25                "Unable to find project import data credentials for project ID: #{@project.id}"
26        end
27
28        opts = {}
29        # Gitea plan to be GitHub compliant
30        if project.gitea_import?
31          uri = URI.parse(project.import_url)
32          host = "#{uri.scheme}://#{uri.host}:#{uri.port}#{uri.path}".sub(%r{/?[\w-]+/[\w-]+\.git\z}, '')
33          opts = {
34            host: host,
35            api_version: 'v1'
36          }
37        end
38
39        @client = Client.new(credentials[:user], **opts)
40      end
41
42      def execute
43        # The ordering of importing is important here due to the way GitHub structures their data
44        # 1. Labels are required by other items while not having a dependency on anything else
45        # so need to be first
46        # 2. Pull requests must come before issues. Every pull request is also an issue but not
47        # all issues are pull requests. Only the issue entity has labels defined in GitHub. GitLab
48        # doesn't structure data like this so we need to make sure that we've created the MRs
49        # before we attempt to add the labels defined in the GitHub issue for the related, already
50        # imported, pull request
51        import_labels
52        import_milestones
53        import_pull_requests
54        import_issues
55        import_comments(:issues)
56
57        # Gitea doesn't have an API endpoint for pull requests comments
58        unless project.gitea_import?
59          import_comments(:pull_requests)
60        end
61
62        import_wiki
63
64        # Gitea doesn't have a Release API yet
65        # See https://github.com/go-gitea/gitea/issues/330
66        unless project.gitea_import?
67          import_releases
68        end
69
70        handle_errors
71
72        true
73      end
74
75      private
76
77      def credentials
78        return @credentials if defined?(@credentials)
79
80        @credentials = project.import_data ? project.import_data.credentials : nil
81      end
82
83      def handle_errors
84        return unless errors.any?
85
86        project.import_state.update_column(:last_error, {
87          message: 'The remote data could not be fully imported.',
88          errors: errors
89        }.to_json)
90      end
91
92      def import_labels
93        fetch_resources(:labels, repo, per_page: 100) do |labels|
94          labels.each do |raw|
95            gh_label = LabelFormatter.new(project, raw)
96            gh_label.create!
97          rescue StandardError => e
98            errors << { type: :label, url: Gitlab::UrlSanitizer.sanitize(gh_label.url), errors: e.message }
99          end
100        end
101
102        cache_labels!
103      end
104
105      def import_milestones
106        fetch_resources(:milestones, repo, state: :all, per_page: 100) do |milestones|
107          milestones.each do |raw|
108            gh_milestone = MilestoneFormatter.new(project, raw)
109            gh_milestone.create!
110          rescue StandardError => e
111            errors << { type: :milestone, url: Gitlab::UrlSanitizer.sanitize(gh_milestone.url), errors: e.message }
112          end
113        end
114      end
115
116      # rubocop: disable CodeReuse/ActiveRecord
117      def import_issues
118        fetch_resources(:issues, repo, state: :all, sort: :created, direction: :asc, per_page: 100) do |issues|
119          issues.each do |raw|
120            gh_issue = IssueFormatter.new(project, raw, client)
121
122            begin
123              issuable =
124                if gh_issue.pull_request?
125                  MergeRequest.find_by(target_project_id: project.id, iid: gh_issue.number)
126                else
127                  gh_issue.create!
128                end
129
130              apply_labels(issuable, raw)
131            rescue StandardError => e
132              errors << { type: :issue, url: Gitlab::UrlSanitizer.sanitize(gh_issue.url), errors: e.message }
133            end
134          end
135        end
136      end
137      # rubocop: enable CodeReuse/ActiveRecord
138
139      def import_pull_requests
140        fetch_resources(:pull_requests, repo, state: :all, sort: :created, direction: :asc, per_page: 100) do |pull_requests|
141          pull_requests.each do |raw|
142            gh_pull_request = PullRequestFormatter.new(project, raw, client)
143
144            next unless gh_pull_request.valid?
145
146            begin
147              restore_source_branch(gh_pull_request) unless gh_pull_request.source_branch_exists?
148              restore_target_branch(gh_pull_request) unless gh_pull_request.target_branch_exists?
149
150              merge_request = gh_pull_request.create!
151
152              # Gitea doesn't return PR in the Issue API endpoint, so labels must be assigned at this stage
153              if project.gitea_import?
154                apply_labels(merge_request, raw)
155              end
156            rescue StandardError => e
157              errors << { type: :pull_request, url: Gitlab::UrlSanitizer.sanitize(gh_pull_request.url), errors: e.message }
158            ensure
159              clean_up_restored_branches(gh_pull_request)
160            end
161          end
162        end
163
164        project.repository.after_remove_branch
165      end
166
167      def restore_source_branch(pull_request)
168        project.repository.create_branch(pull_request.source_branch_name, pull_request.source_branch_sha)
169      end
170
171      def restore_target_branch(pull_request)
172        project.repository.create_branch(pull_request.target_branch_name, pull_request.target_branch_sha)
173      end
174
175      def remove_branch(name)
176        project.repository.delete_branch(name)
177      rescue Gitlab::Git::Repository::DeleteBranchFailed
178        errors << { type: :remove_branch, name: name }
179      end
180
181      def clean_up_restored_branches(pull_request)
182        return if pull_request.opened?
183
184        remove_branch(pull_request.source_branch_name) unless pull_request.source_branch_exists?
185        remove_branch(pull_request.target_branch_name) unless pull_request.target_branch_exists?
186      end
187
188      def apply_labels(issuable, raw)
189        return unless raw.labels.count > 0
190
191        label_ids = raw.labels
192          .map { |attrs| @labels[attrs.name] }
193          .compact
194
195        issuable.update_attribute(:label_ids, label_ids)
196      end
197
198      # rubocop: disable CodeReuse/ActiveRecord
199      def import_comments(issuable_type)
200        resource_type = "#{issuable_type}_comments".to_sym
201
202        # Two notes here:
203        # 1. We don't have a distinctive attribute for comments (unlike issues iid), so we fetch the last inserted note,
204        # compare it against every comment in the current imported page until we find match, and that's where start importing
205        # 2. GH returns comments for _both_ issues and PRs through issues_comments API, while pull_requests_comments returns
206        # only comments on diffs, so select last note not based on noteable_type but on line_code
207        line_code_is = issuable_type == :pull_requests ? 'NOT NULL' : 'NULL'
208        last_note    = project.notes.where("line_code IS #{line_code_is}").last
209
210        fetch_resources(resource_type, repo, per_page: 100) do |comments|
211          if last_note
212            discard_inserted_comments(comments, last_note)
213            last_note = nil
214          end
215
216          create_comments(comments)
217        end
218      end
219      # rubocop: enable CodeReuse/ActiveRecord
220
221      # rubocop: disable CodeReuse/ActiveRecord
222      def create_comments(comments)
223        ActiveRecord::Base.no_touching do
224          comments.each do |raw|
225            comment = CommentFormatter.new(project, raw, client)
226
227            # GH does not return info about comment's parent, so we guess it by checking its URL!
228            *_, parent, iid = URI(raw.html_url).path.split('/')
229
230            issuable = if parent == 'issues'
231                         Issue.find_by(project_id: project.id, iid: iid)
232                       else
233                         MergeRequest.find_by(target_project_id: project.id, iid: iid)
234                       end
235
236            next unless issuable
237
238            issuable.notes.create!(comment.attributes)
239          rescue StandardError => e
240            errors << { type: :comment, url: Gitlab::UrlSanitizer.sanitize(raw.url), errors: e.message }
241          end
242        end
243      end
244      # rubocop: enable CodeReuse/ActiveRecord
245
246      def discard_inserted_comments(comments, last_note)
247        last_note_attrs = nil
248
249        cut_off_index = comments.find_index do |raw|
250          comment           = CommentFormatter.new(project, raw)
251          comment_attrs     = comment.attributes
252          last_note_attrs ||= last_note.slice(*comment_attrs.keys)
253
254          comment_attrs.with_indifferent_access == last_note_attrs
255        end
256
257        # No matching resource in the collection, which means we got halted right on the end of the last page, so all good
258        return unless cut_off_index
259
260        # Otherwise, remove the resources we've already inserted
261        comments.shift(cut_off_index + 1)
262      end
263
264      def import_wiki
265        return if project.wiki.repository_exists?
266
267        wiki = WikiFormatter.new(project)
268        project.wiki.repository.import_repository(wiki.import_url)
269      rescue ::Gitlab::Git::CommandError => e
270        # GitHub error message when the wiki repo has not been created,
271        # this means that repo has wiki enabled, but have no pages. So,
272        # we can skip the import.
273        if e.message !~ /repository not exported/
274          errors << { type: :wiki, errors: e.message }
275        end
276      end
277
278      def import_releases
279        fetch_resources(:releases, repo, per_page: 100) do |releases|
280          releases.each do |raw|
281            gh_release = ReleaseFormatter.new(project, raw)
282            gh_release.create! if gh_release.valid?
283          rescue StandardError => e
284            errors << { type: :release, url: Gitlab::UrlSanitizer.sanitize(gh_release.url), errors: e.message }
285          end
286        end
287      end
288
289      def cache_labels!
290        project.labels.select(:id, :title).find_each do |label|
291          @labels[label.title] = label.id
292        end
293      end
294
295      def fetch_resources(resource_type, *opts)
296        return if imported?(resource_type)
297
298        opts.last[:page] = current_page(resource_type)
299
300        client.public_send(resource_type, *opts) do |resources| # rubocop:disable GitlabSecurity/PublicSend
301          yield resources
302          increment_page(resource_type)
303        end
304
305        imported!(resource_type)
306      rescue ::Octokit::NotFound => e
307        errors << { type: resource_type, errors: e.message }
308      end
309
310      def imported?(resource_type)
311        Rails.cache.read("#{cache_key_prefix}:#{resource_type}:imported")
312      end
313
314      def imported!(resource_type)
315        Rails.cache.write("#{cache_key_prefix}:#{resource_type}:imported", true, ex: 1.day)
316      end
317
318      def increment_page(resource_type)
319        key = "#{cache_key_prefix}:#{resource_type}:current-page"
320
321        # Rails.cache.increment calls INCRBY directly on the value stored under the key, which is
322        # a serialized ActiveSupport::Cache::Entry, so it will return an error by Redis, hence this ugly work-around
323        page = Rails.cache.read(key)
324        page += 1
325        Rails.cache.write(key, page)
326
327        page
328      end
329
330      def current_page(resource_type)
331        Rails.cache.fetch("#{cache_key_prefix}:#{resource_type}:current-page", ex: 1.day) { 1 }
332      end
333
334      def cache_key_prefix
335        @cache_key_prefix ||= "github-import:#{project.id}"
336      end
337    end
338  end
339end
340