1# ============================================================
# This file attempts to obtain a global lock (for a given
# region / account combination).
4#
5# This makes one attempt to get the lock and will set the
6# won_lock variable to True or False to indicate whether
7# or not we got the lock.
8#
9# It's expected that this will be executed in a retry loop
10# so that if we don't get the lock we delay then try again.
11#
12# This should only be used in a block with cleanup-lock.yaml
13# included in the always clause to ensure the lock is released.
14#
15# There are several variables that control the locking behaviour:
16# * lock_timeout_seconds
17#     How old a lock must be before it's assumed to be an expired
18#     lock that was not cleaned up by the owner. Any locks older
19#     than this will not prevent a lock being obtained and will
20#     be deleted when a new process obtains the lock.
21# * lock_log_group_prefix
22#     The log_group prefix that represents the lock being obtained.
23#     This must be the same across all processes trying to obtain
24#     the lock.
25# * lock_process_id
#     A unique identifier of this process. Each process that might
#     attempt to obtain the lock must have a different identifier.
28#     This defaults to the resource_prefix which is generally
29#     appropriate.
30# * max_obtain_lock_attempts
31#     How many attempts to make to get the lock before giving up
32#     NB: This is actually done in main.yaml
# * obtain_lock_delay_seconds
34#     How long to delay after failing to get the lock before
35#     trying again.
36#     NB: This is actually done in obtain-lock-wrapper.yaml
37#
38# The locking here is based around creating cloudwatch log groups.
39# This resource was chosen because:
40#   A) it's free
41#   B) we have a built in grouping concept because of the hierarchy
42#      that allows us to easily group attempts for the same lock
43#   C) the creation time is tracked and returned which gives us
44#      a mechanism for deterministically picking a winner
45#
46# Each lock is represented by a log group prefix. Each attempt
47# to obtain the lock is a log group of the lock_process_id below
48# that prefix.
49#
50# The winning lock is the one with the earliest creation time.
51#
52# To prevent a hanging lock from permanently hanging the build
53# lock attempts older than the lock timeout are ignored and
54# cleaned up by the next process to win the lock.
55# ============================================================
56
# Collect the AWS credentials and region into a single fact. The YAML
# anchor on the mapping lets the other tasks in this file merge the same
# connection settings into their module arguments.
- name: set up aws connection info
  no_log: true  # the fact carries credentials, keep them out of task output
  set_fact:
    aws_connection_info: &aws_connection_info
      region: '{{ aws_region }}'
      aws_access_key: '{{ aws_access_key }}'
      aws_secret_key: '{{ aws_secret_key }}'
      security_token: '{{ security_token }}'
65
# Build the name of this process's lock-attempt log group: the shared lock
# prefix followed by this process's identifier. resource_prefix is used
# as the identifier when lock_process_id is not defined.
- name: Set lock_attempt_log_group_name
  set_fact:
    lock_attempt_log_group_name: >-
      {{ lock_log_group_prefix }}/{{ lock_process_id | default(resource_prefix) }}
69
  # Note the overwrite below to ensure that the creation time
  # is updated. This is important as we calculate expiry relative
  # to the attempt creation.
  #
  # Because of this it's important that we delete the attempt
  # if we don't get the lock. Otherwise we can get a deadlock
  # where the stale attempt from one process wins, but then
  # because that process updates the creation date it doesn't
  # consider itself to have won.
- name: Create Lock Attempt Log Group
  register: lock_attempt_log_group_result
  cloudwatchlogs_log_group:
    <<: *aws_connection_info
    state: present
    # Overwrite any existing group of the same name so that this attempt
    # gets a new creation time; expiry is measured from that time.
    overwrite: true
    log_group_name: '{{ lock_attempt_log_group_name }}'
86
# Look up the log groups under the lock prefix, i.e. the attempts made
# to obtain this lock. The result is registered for the tasks that
# split the attempts into expired and active ones.
- name: Get Lock Attempt Lock Groups
  register: lock_attempt_log_groups
  cloudwatchlogs_log_group_info:
    <<: *aws_connection_info
    log_group_name: '{{ lock_log_group_prefix }}/'
92
# Work out the cut-off timestamp: any attempt created before it is treated
# as expired. The cut-off is this attempt's creation time minus the lock
# timeout; the timeout is given in seconds and is multiplied by 1000 to
# match the units creation_time is compared in (treated as milliseconds).
#
# lock_timeout_seconds is cast to an integer first. If it arrives as a
# string (for example from extra vars on the command line), "*" would
# repeat the string instead of multiplying and the subtraction would fail.
- name: Calculate Expired Lock Attempt Timestamp
  set_fact:
    expired_lock_timestamp: "{{ lock_attempt_log_group_result.creation_time - ((lock_timeout_seconds | int) * 1000) }}"
96
# Split the listed attempts around the expiry cut-off: attempts created at
# or after the cut-off are active, attempts created before it are expired.
# expired_lock_timestamp is cast with "int" before it is compared with
# each group's creation_time.
- name: Get Expired and Active Lock Attempts
  set_fact:
    active_lock_attempts: >-
      {{ lock_attempt_log_groups.log_groups
         | selectattr('creation_time', '>=', expired_lock_timestamp | int)
         | list }}
    expired_lock_attempts: >-
      {{ lock_attempt_log_groups.log_groups
         | selectattr('creation_time', '<', expired_lock_timestamp | int)
         | list }}
101
# The winner is the active attempt with the earliest creation time:
# order the active attempts by creation_time and take the first one.
- name: Pick Winning Lock Attempt
  set_fact:
    winning_lock_attempt: >-
      {{ active_lock_attempts
         | sort(attribute='creation_time')
         | first }}
105
# We hold the lock only when the winning attempt is our own log group.
- name: Determine if Won Lock
  set_fact:
    won_lock: >-
      {{ lock_attempt_log_group_name == winning_lock_attempt.log_group_name }}
109
110  # Remove the lock attempt if we didn't get the lock. This prevents
111  # our stale lock attempt blocking another process from getting the lock.
112  # See more detailed comment above Create Lock Attempt Log Group
- name: Remove Failed Lock Attempt Log Group
  # Only runs when this process did not win the lock.
  when: not (won_lock | bool)
  cloudwatchlogs_log_group:
    <<: *aws_connection_info
    state: absent
    log_group_name: '{{ lock_attempt_log_group_name }}'
119
# Once we hold the lock, delete every attempt that was classified as
# expired, one log group per loop iteration. Skipped when we lost.
- name: Delete Expired Lock Attempts
  loop: '{{ expired_lock_attempts }}'
  when: won_lock | bool
  cloudwatchlogs_log_group:
    <<: *aws_connection_info
    state: absent
    log_group_name: '{{ item.log_group_name }}'
127