/** @file

  A brief file description

  @section license License

  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 */

#include "P_Cache.h"

#define UINT_WRAP_LTE(_x, _y) (((_y) - (_x)) < INT_MAX) // exploit overflow
#define UINT_WRAP_GTE(_x, _y) (((_x) - (_y)) < INT_MAX) // exploit overflow
#define UINT_WRAP_LT(_x, _y) (((_x) - (_y)) >= INT_MAX) // exploit overflow
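
// Added commentary: these macros compare serial numbers that may wrap around
// 2^32 by treating the unsigned difference as a signed distance. A worked
// example with 32-bit arithmetic:
//   UINT_WRAP_LTE(0xFFFFFFFEu, 1u): (1 - 0xFFFFFFFE) wraps to 3, and
//     3 < INT_MAX, so 0xFFFFFFFE is considered "<=" 1 across the wrap.
//   UINT_WRAP_LTE(5u, 1u): (1 - 5) wraps to 0xFFFFFFFC, which is not
//     < INT_MAX, so 5 is (correctly) not "<=" 1.
// In other words, x "<=" y holds whenever y is no more than INT_MAX ahead
// of x modulo 2^32.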

// Given a key, finds the index of the alternate which matches it.
// Used to get the alternate which is actually present in the document.
int
get_alternate_index(CacheHTTPInfoVector *cache_vector, CacheKey key)
{
  int alt_count = cache_vector->count();
  CacheHTTPInfo *obj;
  if (!alt_count) {
    return -1;
  }
  for (int i = 0; i < alt_count; i++) {
    obj = cache_vector->get(i);
    if (obj->compare_object_key(&key)) {
      // Debug("cache_key", "Resident alternate key  %X", key.slice32(0));
      return i;
    }
  }
  return -1;
}

// Adds/Deletes an alternate in the od->vector (write_vector). If the vector
// becomes empty, deletes the directory entry pointing to the vector. Each
// CacheVC must write the vector down to disk after making changes. If we
// wait till the last writer, that writer will have the responsibility of
// writing the vector even if the http state machine aborts. This
// makes it easier to handle situations where writers abort.
int
CacheVC::updateVector(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
{
  cancel_trigger();
  if (od->reading_vec || od->writing_vec) {
    VC_SCHED_LOCK_RETRY();
  }
  int ret = 0;
  {
    CACHE_TRY_LOCK(lock, vol->mutex, mutex->thread_holding);
    if (!lock.is_locked() || od->writing_vec) {
      VC_SCHED_LOCK_RETRY();
    }

    int vec = alternate.valid();
    if (f.update) {
      // all Update cases. Need to get the alternate index.
      alternate_index = get_alternate_index(write_vector, update_key);
      Debug("cache_update", "updating alternate index %d frags %d", alternate_index,
            alternate_index >= 0 ? write_vector->get(alternate_index)->get_frag_offset_count() : -1);
      // if it's an alternate delete
      if (!vec) {
        ink_assert(!total_len);
        if (alternate_index >= 0) {
          write_vector->remove(alternate_index, true);
          alternate_index = CACHE_ALT_REMOVED;
          if (!write_vector->count()) {
            dir_delete(&first_key, vol, &od->first_dir);
          }
        }
        // the alternate is not there any more. somebody might have
        // deleted it. Just close this writer
        if (alternate_index != CACHE_ALT_REMOVED || !write_vector->count()) {
          SET_HANDLER(&CacheVC::openWriteCloseDir);
          return openWriteCloseDir(EVENT_IMMEDIATE, nullptr);
        }
      }
      if (update_key == od->single_doc_key && (total_len || f.allow_empty_doc || !vec)) {
        od->move_resident_alt = false;
      }
    }
    if (cache_config_http_max_alts > 1 && write_vector->count() >= cache_config_http_max_alts && alternate_index < 0) {
      if (od->move_resident_alt && get_alternate_index(write_vector, od->single_doc_key) == 0) {
        od->move_resident_alt = false;
      }
      write_vector->remove(0, true);
    }
    if (vec) {
      /* preserve fragment offset data from the old info. This path is
         taken iff the update is a header-only update, so the fragment
         data should remain valid.
      */
      // If this is not a header-only update, don't copy fragments.
      if (alternate_index >= 0 &&
          ((total_len == 0 && alternate.get_frag_offset_count() == 0) && !(f.allow_empty_doc && this->vio.nbytes == 0))) {
        alternate.copy_frag_offsets_from(write_vector->get(alternate_index));
      }
      alternate_index = write_vector->insert(&alternate, alternate_index);
    }

    if (od->move_resident_alt && first_buf.get() && !od->has_multiple_writers()) {
      Doc *doc         = reinterpret_cast<Doc *>(first_buf->data());
      int small_doc    = static_cast<int64_t>(doc->data_len()) < static_cast<int64_t>(cache_config_alt_rewrite_max_size);
      int have_res_alt = doc->key == od->single_doc_key;
      // If the new alternate is not written with the vector,
      // then move the old one with the vector. If it's a header-only
      // update, move the resident alternate with the vector.
      // We are sure that the body of the resident alternate that we are
      // rewriting has not changed and the alternate is not being deleted,
      // since we set od->move_resident_alt to false in that case
      // (above in updateVector).
      if (small_doc && have_res_alt && (fragment || (f.update && !total_len))) {
        // for a multiple-fragment document, we must have gone through
        // CacheVC::openWriteCloseDataDone
        ink_assert(!fragment || f.data_done);
        od->move_resident_alt  = false;
        f.rewrite_resident_alt = 1;
        write_len              = doc->data_len();
        Debug("cache_update_alt", "rewriting resident alt size: %d key: %X, first_key: %X", write_len, doc->key.slice32(0),
              first_key.slice32(0));
      }
    }
    header_len      = write_vector->marshal_length();
    od->writing_vec = true;
    f.use_first_key = 1;
    SET_HANDLER(&CacheVC::openWriteCloseHeadDone);
    ret = do_write_call();
  }
  if (ret == EVENT_RETURN) {
    return handleEvent(AIO_EVENT_DONE, nullptr);
  }
  return ret;
}
/*
   The following fields of the CacheVC are used when writing down a fragment.
   Make sure that each of the fields is set to a valid value before calling
   this function:
   - frag_type. Checked to see if a vector needs to be marshalled.
   - f.use_first_key. To decide if the vector should be marshalled and to set
     the doc->key to the appropriate key (first_key or earliest_key).
   - f.evac_vector. If set, the writer is pushed to the beginning of the
     agg queue. And if !f.evac_vector && !f.update, the alternate->object_size
     is set to vc->total_len.
   - f.readers. If set, assumes that this is an evacuation, so the write
     is not aborted even if vol->agg_todo_size > agg_write_backlog.
   - f.evacuator. If this is an evacuation.
   - f.rewrite_resident_alt. The resident alternate is rewritten.
   - f.update. Used only if the write_vector needs to be written to disk.
     Used to set the length of the alternate to total_len.
   - write_vector. Used only if frag_type == CACHE_FRAG_TYPE_HTTP &&
     (f.use_first_key || f.evac_vector) is set. write_vector is written to disk.
   - alternate_index. Used only if write_vector needs to be written to disk.
     Used to find the VC's alternate in the write_vector and set its
     length to total_len.
   - write_len. The number of bytes for this fragment.
   - total_len. The total number of bytes for the document so far.
     Doc->total_len and the alternate's total length are set to this value.
   - first_key. Doc's first_key is set to this value.
   - pin_in_cache. Doc's pinned value is set to this + Thread::get_hrtime().
   - earliest_key. If f.use_first_key, Doc's key is set to this value.
   - key. If !f.use_first_key, Doc's key is set to this value.
   - blocks. Used only if write_len is set. Data to be written.
   - offset. Used only if write_len is set. Offset into the block to copy
     the data from.
   - buf. Used only if f.evacuator is set. Should point to the old document.
   This function sets the length, offset, pinned, head and phase of vc->dir.
   */
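
// Condensed sketch (added commentary, mirroring agg_copy() below) of how
// these flags pick the key stored in the Doc header:
//
//   if (vc->f.use_first_key)         // head fragment, carries the vector:
//     doc->key = vc->earliest_key;   //   (or a synthetic key when the
//                                    //   vector is written by itself)
//   else
//     doc->key = vc->key;            // data fragment: the per-fragment key
//
// The directory "head" bit follows the same split: always set when
// f.use_first_key is set, and otherwise set only for fragment 0.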

int
CacheVC::handleWrite(int event, Event * /* e ATS_UNUSED */)
{
  // plain write case
  ink_assert(!trigger);
  frag_len = 0;

  set_agg_write_in_progress();
  POP_HANDLER;
  agg_len = vol->round_to_approx_size(write_len + header_len + frag_len + sizeof(Doc));
  vol->agg_todo_size += agg_len;
  bool agg_error = (agg_len > AGG_SIZE || header_len + sizeof(Doc) > MAX_FRAG_SIZE ||
                    (!f.readers && (vol->agg_todo_size > cache_config_agg_write_backlog + AGG_SIZE) && write_len));
#ifdef CACHE_AGG_FAIL_RATE
  agg_error = agg_error || ((uint32_t)mutex->thread_holding->generator.random() < (uint32_t)(UINT_MAX * CACHE_AGG_FAIL_RATE));
#endif
  bool max_doc_error = (cache_config_max_doc_size && (cache_config_max_doc_size < vio.ndone ||
                                                      (vio.nbytes != INT64_MAX && (cache_config_max_doc_size < vio.nbytes))));

  if (agg_error || max_doc_error) {
    CACHE_INCREMENT_DYN_STAT(cache_write_backlog_failure_stat);
    CACHE_INCREMENT_DYN_STAT(base_stat + CACHE_STAT_FAILURE);
    vol->agg_todo_size -= agg_len;
    io.aio_result = AIO_SOFT_FAILURE;
    if (event == EVENT_CALL) {
      return EVENT_RETURN;
    }
    return handleEvent(AIO_EVENT_DONE, nullptr);
  }
  ink_assert(agg_len <= AGG_SIZE);
  if (f.evac_vector) {
    vol->agg.push(this);
  } else {
    vol->agg.enqueue(this);
  }
  if (!vol->is_io_in_progress()) {
    return vol->aggWrite(event, this);
  }
  return EVENT_CONT;
}

static char *
iobufferblock_memcpy(char *p, int len, IOBufferBlock *ab, int offset)
{
  IOBufferBlock *b = ab;
  while (b && len >= 0) {
    char *start   = b->_start;
    char *end     = b->_end;
    int max_bytes = end - start;
    max_bytes -= offset;
    if (max_bytes <= 0) {
      offset = -max_bytes;
      b      = b->next.get();
      continue;
    }
    int bytes = len;
    if (bytes >= max_bytes) {
      bytes = max_bytes;
    }
    ::memcpy(p, start + offset, bytes);
    p += bytes;
    len -= bytes;
    b      = b->next.get();
    offset = 0;
  }
  return p;
}
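
// Added illustration (not in the original source): given a chain of two
// blocks holding 4 and 8 readable bytes and a call with offset 6, the loop
// above first computes max_bytes = 4 - 6 = -2, so it resets offset to 2 and
// advances to the second block, then copies starting at byte 2 of that
// block. The returned pointer is one past the last byte written, which lets
// callers chain successive copies into the aggregation buffer.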

EvacuationBlock *
Vol::force_evacuate_head(Dir *evac_dir, int pinned)
{
  // build an evacuation block for the object
  EvacuationBlock *b = evacuation_block_exists(evac_dir, this);
  // if we have already started evacuating this document, it's too late
  // to evacuate the head...bad luck
  if (b && b->f.done) {
    return b;
  }

  if (!b) {
    b      = new_EvacuationBlock(mutex->thread_holding);
    b->dir = *evac_dir;
    DDebug("cache_evac", "force: %d, %d", (int)dir_offset(evac_dir), (int)dir_phase(evac_dir));
    evacuate[dir_evac_bucket(evac_dir)].push(b);
  }
  b->f.pinned        = pinned;
  b->f.evacuate_head = 1;
  b->evac_frags.key  = zero_key; // ensure that the block gets
  // evacuated no matter what
  b->readers = 0; // ensure that the block does not disappear
  return b;
}

void
Vol::scan_for_pinned_documents()
{
  if (cache_config_permit_pinning) {
    // we can't evacuate anything between header->write_pos and
    // header->write_pos + AGG_SIZE.
    int ps                = this->offset_to_vol_offset(header->write_pos + AGG_SIZE);
    int pe                = this->offset_to_vol_offset(header->write_pos + 2 * EVACUATION_SIZE + (len / PIN_SCAN_EVERY));
    int vol_end_offset    = this->offset_to_vol_offset(len + skip);
    int before_end_of_vol = pe < vol_end_offset;
    DDebug("cache_evac", "scan %d %d", ps, pe);
    for (int i = 0; i < this->direntries(); i++) {
      // is it a valid pinned object?
      if (!dir_is_empty(&dir[i]) && dir_pinned(&dir[i]) && dir_head(&dir[i])) {
        // select objects only within this PIN_SCAN region
        int o = dir_offset(&dir[i]);
        if (dir_phase(&dir[i]) == header->phase) {
          if (before_end_of_vol || o >= (pe - vol_end_offset)) {
            continue;
          }
        } else {
          if (o < ps || o >= pe) {
            continue;
          }
        }
        force_evacuate_head(&dir[i], 1);
        //      DDebug("cache_evac", "scan pinned at offset %d %d %d %d %d %d",
        //            (int)dir_offset(&b->dir), ps, o , pe, i, (int)b->f.done);
      }
    }
  }
}

/* NOTE: This state can be called by an AIO thread, so DON'T DON'T
   DON'T schedule any events on this thread using VC_SCHED_XXX or
   mutex->thread_holding->schedule_xxx_local(). ALWAYS use
   eventProcessor.schedule_xxx().
   */
int
Vol::aggWriteDone(int event, Event *e)
{
  cancel_trigger();

  // ensure we have the cacheDirSync lock if we intend to call it later
  // retaking the current mutex recursively is a NOOP
  CACHE_TRY_LOCK(lock, dir_sync_waiting ? cacheDirSync->mutex : mutex, mutex->thread_holding);
  if (!lock.is_locked()) {
    eventProcessor.schedule_in(this, HRTIME_MSECONDS(cache_config_mutex_retry_delay));
    return EVENT_CONT;
  }
  if (io.ok()) {
    header->last_write_pos = header->write_pos;
    header->write_pos += io.aiocb.aio_nbytes;
    ink_assert(header->write_pos >= start);
    DDebug("cache_agg", "Dir %s, Write: %" PRIu64 ", last Write: %" PRIu64 "", hash_text.get(), header->write_pos,
           header->last_write_pos);
    ink_assert(header->write_pos == header->agg_pos);
    if (header->write_pos + EVACUATION_SIZE > scan_pos) {
      periodic_scan();
    }
    agg_buf_pos = 0;
    header->write_serial++;
  } else {
    // delete all the directory entries that we inserted
    // for fragments in this aggregation buffer
    Debug("cache_disk_error", "Write error on disk %s\n \
              write range : [%" PRIu64 " - %" PRIu64 " bytes]  [%" PRIu64 " - %" PRIu64 " blocks] \n",
          hash_text.get(), (uint64_t)io.aiocb.aio_offset, (uint64_t)io.aiocb.aio_offset + io.aiocb.aio_nbytes,
          (uint64_t)io.aiocb.aio_offset / CACHE_BLOCK_SIZE,
          (uint64_t)(io.aiocb.aio_offset + io.aiocb.aio_nbytes) / CACHE_BLOCK_SIZE);
    Dir del_dir;
    dir_clear(&del_dir);
    for (int done = 0; done < agg_buf_pos;) {
      Doc *doc = reinterpret_cast<Doc *>(agg_buffer + done);
      dir_set_offset(&del_dir, header->write_pos + done);
      dir_delete(&doc->key, this, &del_dir);
      done += round_to_approx_size(doc->len);
    }
    agg_buf_pos = 0;
  }
  set_io_not_in_progress();
  // callback ready sync CacheVCs
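  // Added note: a CacheVC parked on the sync queue is released only once
  // header->write_serial is at least two ahead of the serial it was written
  // under, presumably so that both the aggregation write carrying its data
  // and a following write have completed before the sync is reported done.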
  CacheVC *c = nullptr;
  while ((c = sync.dequeue())) {
    if (UINT_WRAP_LTE(c->write_serial + 2, header->write_serial)) {
      eventProcessor.schedule_imm(c, ET_CALL, AIO_EVENT_DONE);
    } else {
      sync.push(c); // put it back on the front
      break;
    }
  }
  if (dir_sync_waiting) {
    dir_sync_waiting = false;
    cacheDirSync->handleEvent(EVENT_IMMEDIATE, nullptr);
  }
  if (agg.head || sync.head) {
    return aggWrite(event, e);
  }
  return EVENT_CONT;
}

CacheVC *
new_DocEvacuator(int nbytes, Vol *vol)
{
  CacheVC *c        = new_CacheVC(vol);
  ProxyMutex *mutex = vol->mutex.get();
  c->base_stat      = cache_evacuate_active_stat;
  CACHE_INCREMENT_DYN_STAT(c->base_stat + CACHE_STAT_ACTIVE);
  c->buf          = new_IOBufferData(iobuffer_size_to_index(nbytes, MAX_BUFFER_SIZE_INDEX), MEMALIGNED);
  c->vol          = vol;
  c->f.evacuator  = 1;
  c->earliest_key = zero_key;
  SET_CONTINUATION_HANDLER(c, &CacheVC::evacuateDocDone);
  return c;
}

int
CacheVC::evacuateReadHead(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
{
  // The evacuator vc shares the lock with the volume mutex
  ink_assert(vol->mutex->thread_holding == this_ethread());
  cancel_trigger();
  Doc *doc                     = reinterpret_cast<Doc *>(buf->data());
  CacheHTTPInfo *alternate_tmp = nullptr;
  if (!io.ok()) {
    goto Ldone;
  }
  // a directory entry which is no longer valid may have been overwritten
  if (!dir_valid(vol, &dir)) {
    last_collision = nullptr;
    goto Lcollision;
  }
  if (doc->magic != DOC_MAGIC || !(doc->first_key == first_key)) {
    goto Lcollision;
  }
  alternate_tmp = nullptr;
  if (doc->doc_type == CACHE_FRAG_TYPE_HTTP && doc->hlen) {
    // it's an HTTP document
    if (this->load_http_info(&vector, doc) != doc->hlen) {
      Note("bad vector detected during evacuation");
      goto Ldone;
    }
    alternate_index = get_alternate_index(&vector, earliest_key);
    if (alternate_index < 0) {
      goto Ldone;
    }
    alternate_tmp = vector.get(alternate_index);
    doc_len       = alternate_tmp->object_size_get();
    Debug("cache_evac", "evacuateReadHead http earliest %X first: %X len: %" PRId64, first_key.slice32(0), earliest_key.slice32(0),
          doc_len);
  } else {
    // non-HTTP document
    CacheKey next_key;
    next_CacheKey(&next_key, &doc->key);
    if (!(next_key == earliest_key)) {
      goto Ldone;
    }
    doc_len = doc->total_len;
    DDebug("cache_evac", "evacuateReadHead non-http earliest %X first: %X len: %" PRId64, first_key.slice32(0),
           earliest_key.slice32(0), doc_len);
  }
  if (doc_len == total_len) {
    // the whole document has been evacuated. Insert the directory
    // entry in the directory.
    dir_lookaside_fixup(&earliest_key, vol);
    return free_CacheVC(this);
  }
  return EVENT_CONT;
Lcollision:
  if (dir_probe(&first_key, vol, &dir, &last_collision)) {
    int ret = do_read_call(&first_key);
    if (ret == EVENT_RETURN) {
      return handleEvent(AIO_EVENT_DONE, nullptr);
    }
    return ret;
  }
Ldone:
  dir_lookaside_remove(&earliest_key, vol);
  return free_CacheVC(this);
}

int
CacheVC::evacuateDocDone(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
{
  ink_assert(vol->mutex->thread_holding == this_ethread());
  Doc *doc = reinterpret_cast<Doc *>(buf->data());
  DDebug("cache_evac", "evacuateDocDone %X o %d p %d new_o %d new_p %d", (int)key.slice32(0), (int)dir_offset(&overwrite_dir),
         (int)dir_phase(&overwrite_dir), (int)dir_offset(&dir), (int)dir_phase(&dir));
  int i = dir_evac_bucket(&overwrite_dir);
  // nasty beeping race condition, need to have the EvacuationBlock here
  EvacuationBlock *b = vol->evacuate[i].head;
  for (; b; b = b->link.next) {
    if (dir_offset(&b->dir) == dir_offset(&overwrite_dir)) {
      // If the document is single fragment (although not tied to the vector),
      // then we don't have to put the directory entry in the lookaside
      // buffer. But, we have no way of finding out if the document is
      // single fragment. doc->single_fragment() can be true for a multiple
      // fragment document since total_len and doc->len could be equal at
      // the time we write the fragment down. To be on the safe side, we
      // only overwrite the entry in the directory if it's not a head.
      if (!dir_head(&overwrite_dir)) {
        // find the earliest key
        EvacuationKey *evac = &b->evac_frags;
        for (; evac && !(evac->key == doc->key); evac = evac->link.next) {
          ;
        }
        ink_assert(evac);
        if (!evac) {
          break;
        }
        if (evac->earliest_key.fold()) {
          DDebug("cache_evac", "evacdocdone: evacuating key %X earliest %X", evac->key.slice32(0), evac->earliest_key.slice32(0));
          EvacuationBlock *eblock = nullptr;
          Dir dir_tmp;
          dir_lookaside_probe(&evac->earliest_key, vol, &dir_tmp, &eblock);
          if (eblock) {
            CacheVC *earliest_evac = eblock->earliest_evacuator;
            earliest_evac->total_len += doc->data_len();
            if (earliest_evac->total_len == earliest_evac->doc_len) {
              dir_lookaside_fixup(&evac->earliest_key, vol);
              free_CacheVC(earliest_evac);
            }
          }
        }
        dir_overwrite(&doc->key, vol, &dir, &overwrite_dir);
      }
      // if the tag in the overwrite_dir matches the first_key in the
      // document, then it has to be the vector. We guarantee that
      // the first_key and the earliest_key will never collide (see
      // Cache::open_write). Once we know it's the vector, we can
      // safely overwrite the first_key in the directory.
      if (dir_head(&overwrite_dir) && b->f.evacuate_head) {
        DDebug("cache_evac", "evacuateDocDone evacuate_head %X %X hlen %d offset %d", (int)key.slice32(0), (int)doc->key.slice32(0),
               doc->hlen, (int)dir_offset(&overwrite_dir));

        if (dir_compare_tag(&overwrite_dir, &doc->first_key)) {
          OpenDirEntry *cod;
          DDebug("cache_evac", "evacuating vector: %X %d", (int)doc->first_key.slice32(0), (int)dir_offset(&overwrite_dir));
          if ((cod = vol->open_read(&doc->first_key))) {
            // writer exists
            DDebug("cache_evac", "overwriting the open directory %X %d %d", (int)doc->first_key.slice32(0),
                   (int)dir_offset(&cod->first_dir), (int)dir_offset(&dir));
            cod->first_dir = dir;
          }
          if (dir_overwrite(&doc->first_key, vol, &dir, &overwrite_dir)) {
            int64_t o = dir_offset(&overwrite_dir), n = dir_offset(&dir);
            vol->ram_cache->fixup(&doc->first_key, static_cast<uint32_t>(o >> 32), static_cast<uint32_t>(o),
                                  static_cast<uint32_t>(n >> 32), static_cast<uint32_t>(n));
          }
        } else {
          DDebug("cache_evac", "evacuating earliest: %X %d", (int)doc->key.slice32(0), (int)dir_offset(&overwrite_dir));
          ink_assert(dir_compare_tag(&overwrite_dir, &doc->key));
          ink_assert(b->earliest_evacuator == this);
          total_len += doc->data_len();
          first_key    = doc->first_key;
          earliest_dir = dir;
          if (dir_probe(&first_key, vol, &dir, &last_collision) > 0) {
            dir_lookaside_insert(b, vol, &earliest_dir);
            // read the vector
            SET_HANDLER(&CacheVC::evacuateReadHead);
            int ret = do_read_call(&first_key);
            if (ret == EVENT_RETURN) {
              return handleEvent(AIO_EVENT_DONE, nullptr);
            }
            return ret;
          }
        }
      }
      break;
    }
  }
  return free_CacheVC(this);
}

static int
evacuate_fragments(CacheKey *key, CacheKey *earliest_key, int force, Vol *vol)
{
  Dir dir, *last_collision = nullptr;
  int i = 0;
  while (dir_probe(key, vol, &dir, &last_collision)) {
    // next fragment cannot be a head...if it is, it must have been a
    // directory collision.
    if (dir_head(&dir)) {
      continue;
    }
    EvacuationBlock *b = evacuation_block_exists(&dir, vol);
    if (!b) {
      b                          = new_EvacuationBlock(vol->mutex->thread_holding);
      b->dir                     = dir;
      b->evac_frags.key          = *key;
      b->evac_frags.earliest_key = *earliest_key;
      vol->evacuate[dir_evac_bucket(&dir)].push(b);
      i++;
    } else {
      ink_assert(dir_offset(&dir) == dir_offset(&b->dir));
      ink_assert(dir_phase(&dir) == dir_phase(&b->dir));
      EvacuationKey *evac_frag = evacuationKeyAllocator.alloc();
      evac_frag->key           = *key;
      evac_frag->earliest_key  = *earliest_key;
      evac_frag->link.next     = b->evac_frags.link.next;
      b->evac_frags.link.next  = evac_frag;
    }
    if (force) {
      b->readers = 0;
    }
    DDebug("cache_evac", "next fragment %X Earliest: %X offset %d phase %d force %d", (int)key->slice32(0),
           (int)earliest_key->slice32(0), (int)dir_offset(&dir), (int)dir_phase(&dir), force);
  }
  return i;
}

int
Vol::evacuateWrite(CacheVC *evacuator, int event, Event *e)
{
  // push to front of aggregation write list, so it is written first

  evacuator->agg_len = round_to_approx_size((reinterpret_cast<Doc *>(evacuator->buf->data()))->len);
  agg_todo_size += evacuator->agg_len;
  /* insert the evacuator after all the other evacuators */
  CacheVC *cur   = static_cast<CacheVC *>(agg.head);
  CacheVC *after = nullptr;
  for (; cur && cur->f.evacuator; cur = (CacheVC *)cur->link.next) {
    after = cur;
  }
  ink_assert(evacuator->agg_len <= AGG_SIZE);
  agg.insert(evacuator, after);
  return aggWrite(event, e);
}

int
Vol::evacuateDocReadDone(int event, Event *e)
{
  cancel_trigger();
  if (event != AIO_EVENT_DONE) {
    return EVENT_DONE;
  }
  ink_assert(is_io_in_progress());
  set_io_not_in_progress();
  ink_assert(mutex->thread_holding == this_ethread());
  Doc *doc = reinterpret_cast<Doc *>(doc_evacuator->buf->data());
  CacheKey next_key;
  EvacuationBlock *b = nullptr;
  if (doc->magic != DOC_MAGIC) {
    Debug("cache_evac", "DOC magic: %X %d", (int)dir_tag(&doc_evacuator->overwrite_dir),
          (int)dir_offset(&doc_evacuator->overwrite_dir));
    ink_assert(doc->magic == DOC_MAGIC);
    goto Ldone;
  }
  DDebug("cache_evac", "evacuateDocReadDone %X offset %d", (int)doc->key.slice32(0),
         (int)dir_offset(&doc_evacuator->overwrite_dir));

  b = evacuate[dir_evac_bucket(&doc_evacuator->overwrite_dir)].head;
  while (b) {
    if (dir_offset(&b->dir) == dir_offset(&doc_evacuator->overwrite_dir)) {
      break;
    }
    b = b->link.next;
  }
  if (!b) {
    goto Ldone;
  }
  if ((b->f.pinned && !b->readers) && doc->pinned < static_cast<uint32_t>(Thread::get_hrtime() / HRTIME_SECOND)) {
    goto Ldone;
  }

  if (dir_head(&b->dir) && b->f.evacuate_head) {
    ink_assert(!b->evac_frags.key.fold());
    // if it's a head (vector), evacuation is real simple...we just
    // need to write this vector down and overwrite the directory entry.
    if (dir_compare_tag(&b->dir, &doc->first_key)) {
      doc_evacuator->key = doc->first_key;
      b->evac_frags.key  = doc->first_key;
      DDebug("cache_evac", "evacuating vector %X offset %d", (int)doc->first_key.slice32(0),
             (int)dir_offset(&doc_evacuator->overwrite_dir));
      b->f.unused = 57;
    } else {
      // if it's an earliest fragment (alternate) evacuation, things get
      // a little tricky. We have to propagate the earliest key to the next
      // fragments for this alternate. The last fragment to be evacuated
      // fixes up the lookaside buffer.
      doc_evacuator->key          = doc->key;
      doc_evacuator->earliest_key = doc->key;
      b->evac_frags.key           = doc->key;
      b->evac_frags.earliest_key  = doc->key;
      b->earliest_evacuator       = doc_evacuator;
      DDebug("cache_evac", "evacuating earliest %X %X evac: %p offset: %d", (int)b->evac_frags.key.slice32(0),
             (int)doc->key.slice32(0), doc_evacuator, (int)dir_offset(&doc_evacuator->overwrite_dir));
      b->f.unused = 67;
    }
  } else {
    // find which key matches the document
    EvacuationKey *ek = &b->evac_frags;
    for (; ek && !(ek->key == doc->key); ek = ek->link.next) {
      ;
    }
    if (!ek) {
      b->f.unused = 77;
      goto Ldone;
    }
    doc_evacuator->key          = ek->key;
    doc_evacuator->earliest_key = ek->earliest_key;
    DDebug("cache_evac", "evacuateDocReadDone key: %X earliest: %X", (int)ek->key.slice32(0), (int)ek->earliest_key.slice32(0));
    b->f.unused = 87;
  }
  // if the tag in the b->dir does not match the first_key in the
  // document, then it has to be the earliest fragment. We guarantee that
  // the first_key and the earliest_key will never collide (see
  // Cache::open_write).
  if (!dir_head(&b->dir) || !dir_compare_tag(&b->dir, &doc->first_key)) {
    next_CacheKey(&next_key, &doc->key);
    evacuate_fragments(&next_key, &doc_evacuator->earliest_key, !b->readers, this);
  }
  return evacuateWrite(doc_evacuator, event, e);
Ldone:
  free_CacheVC(doc_evacuator);
  doc_evacuator = nullptr;
  return aggWrite(event, e);
}

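// Added descriptive comment: scan the evacuation buckets covering
// [low, high) and issue an AIO read for the lowest-offset block in the
// given phase that has not been handled yet. Returns -1 when a read has
// been started (completion arrives in evacuateDocReadDone) and 0 when
// there is nothing left to evacuate in the range; aggWrite() treats a
// negative return as "wait for the evacuation read".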
int
Vol::evac_range(off_t low, off_t high, int evac_phase)
{
  off_t s = this->offset_to_vol_offset(low);
  off_t e = this->offset_to_vol_offset(high);
  int si  = dir_offset_evac_bucket(s);
  int ei  = dir_offset_evac_bucket(e);

  for (int i = si; i <= ei; i++) {
    EvacuationBlock *b     = evacuate[i].head;
    EvacuationBlock *first = nullptr;
    int64_t first_offset   = INT64_MAX;
    for (; b; b = b->link.next) {
      int64_t offset = dir_offset(&b->dir);
      int phase      = dir_phase(&b->dir);
      if (offset >= s && offset < e && !b->f.done && phase == evac_phase) {
        if (offset < first_offset) {
          first        = b;
          first_offset = offset;
        }
      }
    }
    if (first) {
      first->f.done       = 1;
      io.aiocb.aio_fildes = fd;
      io.aiocb.aio_nbytes = dir_approx_size(&first->dir);
      io.aiocb.aio_offset = this->vol_offset(&first->dir);
      if (static_cast<off_t>(io.aiocb.aio_offset + io.aiocb.aio_nbytes) > static_cast<off_t>(skip + len)) {
        io.aiocb.aio_nbytes = skip + len - io.aiocb.aio_offset;
      }
      doc_evacuator                = new_DocEvacuator(io.aiocb.aio_nbytes, this);
      doc_evacuator->overwrite_dir = first->dir;

      io.aiocb.aio_buf = doc_evacuator->buf->data();
      io.action        = this;
      io.thread        = AIO_CALLBACK_THREAD_ANY;
      DDebug("cache_evac", "evac_range evacuating %X %d", (int)dir_tag(&first->dir), (int)dir_offset(&first->dir));
      SET_HANDLER(&Vol::evacuateDocReadDone);
      ink_assert(ink_aio_read(&io) >= 0);
      return -1;
    }
  }
  return 0;
}

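// Added descriptive comment: copy one queued CacheVC's pending write into
// the aggregation buffer at p, filling in the Doc header and this writer's
// in-memory directory entry. Returns the number of bytes consumed in the
// buffer: vc->agg_len for ordinary writes, or the rounded document length
// for evacuated documents.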
static int
agg_copy(char *p, CacheVC *vc)
{
  Vol *vol = vc->vol;
  off_t o  = vol->header->write_pos + vol->agg_buf_pos;

  if (!vc->f.evacuator) {
    Doc *doc                   = reinterpret_cast<Doc *>(p);
    IOBufferBlock *res_alt_blk = nullptr;

    uint32_t len = vc->write_len + vc->header_len + vc->frag_len + sizeof(Doc);
    ink_assert(vc->frag_type != CACHE_FRAG_TYPE_HTTP || len != sizeof(Doc));
    ink_assert(vol->round_to_approx_size(len) == vc->agg_len);
    // update copy of directory entry for this document
    dir_set_approx_size(&vc->dir, vc->agg_len);
    dir_set_offset(&vc->dir, vol->offset_to_vol_offset(o));
    ink_assert(vol->vol_offset(&vc->dir) < (vol->skip + vol->len));
    dir_set_phase(&vc->dir, vol->header->phase);

    // fill in document header
    doc->magic       = DOC_MAGIC;
    doc->len         = len;
    doc->hlen        = vc->header_len;
    doc->doc_type    = vc->frag_type;
    doc->v_major     = CACHE_DB_MAJOR_VERSION;
    doc->v_minor     = CACHE_DB_MINOR_VERSION;
    doc->unused      = 0; // force this for forward compatibility.
    doc->total_len   = vc->total_len;
    doc->first_key   = vc->first_key;
    doc->sync_serial = vol->header->sync_serial;
    vc->write_serial = doc->write_serial = vol->header->write_serial;
    doc->checksum                        = DOC_NO_CHECKSUM;
    if (vc->pin_in_cache) {
      dir_set_pinned(&vc->dir, 1);
      doc->pinned = static_cast<uint32_t>(Thread::get_hrtime() / HRTIME_SECOND) + vc->pin_in_cache;
    } else {
      dir_set_pinned(&vc->dir, 0);
      doc->pinned = 0;
    }

    if (vc->f.use_first_key) {
      if (doc->data_len() || vc->f.allow_empty_doc) {
        doc->key = vc->earliest_key;
      } else { // the vector is being written by itself
        if (vc->earliest_key == zero_key) {
          do {
            rand_CacheKey(&doc->key, vc->vol->mutex);
          } while (DIR_MASK_TAG(doc->key.slice32(2)) == DIR_MASK_TAG(vc->first_key.slice32(2)));
        } else {
          prev_CacheKey(&doc->key, &vc->earliest_key);
        }
      }
      dir_set_head(&vc->dir, true);
    } else {
      doc->key = vc->key;
      dir_set_head(&vc->dir, !vc->fragment);
    }

    if (vc->f.rewrite_resident_alt) {
      ink_assert(vc->f.use_first_key);
      Doc *res_doc   = reinterpret_cast<Doc *>(vc->first_buf->data());
      res_alt_blk    = new_IOBufferBlock(vc->first_buf, res_doc->data_len(), sizeof(Doc) + res_doc->hlen);
      doc->key       = res_doc->key;
      doc->total_len = res_doc->data_len();
    }
    // update the new_info object_key, total_len and dirinfo
    if (vc->header_len) {
      ink_assert(vc->f.use_first_key);
      if (vc->frag_type == CACHE_FRAG_TYPE_HTTP) {
        ink_assert(vc->write_vector->count() > 0);
        if (!vc->f.update && !vc->f.evac_vector) {
          ink_assert(!(vc->first_key == zero_key));
          CacheHTTPInfo *http_info = vc->write_vector->get(vc->alternate_index);
          http_info->object_size_set(vc->total_len);
        }
        // update + data_written => Update case (b)
        // need to change the old alternate's object length
        if (vc->f.update && vc->total_len) {
          CacheHTTPInfo *http_info = vc->write_vector->get(vc->alternate_index);
          http_info->object_size_set(vc->total_len);
        }
        ink_assert(!(((uintptr_t)&doc->hdr()[0]) & HDR_PTR_ALIGNMENT_MASK));
        ink_assert(vc->header_len == vc->write_vector->marshal(doc->hdr(), vc->header_len));
      } else {
        memcpy(doc->hdr(), vc->header_to_write, vc->header_len);
      }
      // the single fragment flag is not used in the write call;
      // putting it in for completeness.
      vc->f.single_fragment = doc->single_fragment();
    }
    // move data
    if (vc->write_len) {
      {
        ProxyMutex *mutex ATS_UNUSED = vc->vol->mutex.get();
        ink_assert(mutex->thread_holding == this_ethread());
        CACHE_DEBUG_SUM_DYN_STAT(cache_write_bytes_stat, vc->write_len);
      }
      if (vc->f.rewrite_resident_alt) {
        iobufferblock_memcpy(doc->data(), vc->write_len, res_alt_blk, 0);
      } else {
        iobufferblock_memcpy(doc->data(), vc->write_len, vc->blocks.get(), vc->offset);
      }
#ifdef VERIFY_JTEST_DATA
      if (f.use_first_key && header_len) {
        int ib = 0, xd = 0;
        char xx[500];
        new_info.request_get().url_get().print(xx, 500, &ib, &xd);
        char *x = xx;
        for (int q = 0; q < 3; q++)
          x = strchr(x + 1, '/');
        ink_assert(!memcmp(doc->hdr(), x, ib - (x - xx)));
      }
#endif
    }
    if (cache_config_enable_checksum) {
      doc->checksum = 0;
      for (char *b = doc->hdr(); b < reinterpret_cast<char *>(doc) + doc->len; b++) {
        doc->checksum += *b;
      }
    }
    if (vc->frag_type == CACHE_FRAG_TYPE_HTTP && vc->f.single_fragment) {
      ink_assert(doc->hlen);
    }

    if (res_alt_blk) {
      res_alt_blk->free();
    }

    return vc->agg_len;
  } else {
    // for evacuated documents, copy the data, and update directory
    Doc *doc = reinterpret_cast<Doc *>(vc->buf->data());
    int l    = vc->vol->round_to_approx_size(doc->len);
    {
      ProxyMutex *mutex ATS_UNUSED = vc->vol->mutex.get();
      ink_assert(mutex->thread_holding == this_ethread());
      CACHE_DEBUG_INCREMENT_DYN_STAT(cache_gc_frags_evacuated_stat);
      CACHE_DEBUG_SUM_DYN_STAT(cache_gc_bytes_evacuated_stat, l);
    }

    doc->sync_serial  = vc->vol->header->sync_serial;
    doc->write_serial = vc->vol->header->write_serial;

    memcpy(p, doc, doc->len);

    vc->dir = vc->overwrite_dir;
    dir_set_offset(&vc->dir, vc->vol->offset_to_vol_offset(o));
    dir_set_phase(&vc->dir, vc->vol->header->phase);
    return l;
  }
}

inline void
Vol::evacuate_cleanup_blocks(int i)
{
  EvacuationBlock *b = evacuate[i].head;
  while (b) {
    if (b->f.done && ((header->phase != dir_phase(&b->dir) && header->write_pos > this->vol_offset(&b->dir)) ||
                      (header->phase == dir_phase(&b->dir) && header->write_pos <= this->vol_offset(&b->dir)))) {
      EvacuationBlock *x = b;
      DDebug("cache_evac", "evacuate cleanup free %X offset %d", (int)b->evac_frags.key.slice32(0), (int)dir_offset(&b->dir));
      b = b->link.next;
      evacuate[i].remove(x);
      free_EvacuationBlock(x, mutex->thread_holding);
      continue;
    }
    b = b->link.next;
  }
}

void
Vol::evacuate_cleanup()
{
  int64_t eo = ((header->write_pos - start) / CACHE_BLOCK_SIZE) + 1;
  int64_t e  = dir_offset_evac_bucket(eo);
  int64_t sx = e - (evacuate_size / PIN_SCAN_EVERY) - 1;
  int64_t s  = sx;
  int i;

  if (e > evacuate_size) {
    e = evacuate_size;
  }
  if (sx < 0) {
    s = 0;
  }
  for (i = s; i < e; i++) {
    evacuate_cleanup_blocks(i);
  }

  // if we have wrapped, handle the end bit
  if (sx <= 0) {
    s = evacuate_size + sx - 2;
    if (s < 0) {
      s = 0;
    }
    for (i = s; i < evacuate_size; i++) {
      evacuate_cleanup_blocks(i);
    }
  }
}

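// Added note: periodic_scan() is driven from aggWriteDone() and agg_wrap(),
// which call it whenever the write position has advanced to within
// EVACUATION_SIZE of scan_pos. Each call frees finished evacuation blocks
// and re-scans for pinned documents, then advances scan_pos by
// len / PIN_SCAN_EVERY, so the pin scan sweeps the whole volume over
// PIN_SCAN_EVERY calls.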
void
Vol::periodic_scan()
{
  evacuate_cleanup();
  scan_for_pinned_documents();
  if (header->write_pos == start) {
    scan_pos = start;
  }
  scan_pos += len / PIN_SCAN_EVERY;
}

void
Vol::agg_wrap()
{
  header->write_pos = start;
  header->phase     = !header->phase;

  header->cycle++;
  header->agg_pos = header->write_pos;
  dir_lookaside_cleanup(this);
  dir_clean_vol(this);
  {
    Vol *vol = this;
    CACHE_INCREMENT_DYN_STAT(cache_directory_wrap_stat);
    Note("Cache volume %d on disk '%s' wraps around", vol->cache_vol->vol_number, vol->hash_text.get());
  }
  periodic_scan();
}

/* NOTE: This state can be called by an AIO thread, so DON'T DON'T
   DON'T schedule any events on this thread using VC_SCHED_XXX or
   mutex->thread_holding->schedule_xxx_local(). ALWAYS use
   eventProcessor.schedule_xxx().
   Also, make sure that any functions called by this also use
   the eventProcessor to schedule events
*/
int
Vol::aggWrite(int event, void * /* e ATS_UNUSED */)
{
  ink_assert(!is_io_in_progress());

  Que(CacheVC, link) tocall;
  CacheVC *c;

  cancel_trigger();

Lagain:
  // calculate length of aggregated write
  for (c = static_cast<CacheVC *>(agg.head); c;) {
    int writelen = c->agg_len;
    // [amc] this is checked in multiple places; only here was it strictly less.
    ink_assert(writelen <= AGG_SIZE);
    if (agg_buf_pos + writelen > AGG_SIZE || header->write_pos + agg_buf_pos + writelen > (skip + len)) {
      break;
    }
    DDebug("agg_read", "copying: %d, %" PRIu64 ", key: %d", agg_buf_pos, header->write_pos + agg_buf_pos, c->first_key.slice32(0));
    int wrotelen = agg_copy(agg_buffer + agg_buf_pos, c);
    ink_assert(writelen == wrotelen);
    agg_todo_size -= writelen;
    agg_buf_pos += writelen;
    CacheVC *n = (CacheVC *)c->link.next;
    agg.dequeue();
    if (c->f.sync && c->f.use_first_key) {
      CacheVC *last = sync.tail;
      while (last && UINT_WRAP_LT(c->write_serial, last->write_serial)) {
        last = (CacheVC *)last->link.prev;
      }
      sync.insert(c, last);
    } else if (c->f.evacuator) {
      c->handleEvent(AIO_EVENT_DONE, nullptr);
    } else {
      tocall.enqueue(c);
    }
    c = n;
  }

  // if we got nothing...
  if (!agg_buf_pos) {
    if (!agg.head && !sync.head) { // nothing to get
      return EVENT_CONT;
    }
    if (header->write_pos == start) {
      // write aggregation too long, bad bad, punt on everything.
      Note("write aggregation exceeds vol size");
      ink_assert(!tocall.head);
      ink_assert(false);
      while ((c = agg.dequeue())) {
        agg_todo_size -= c->agg_len;
        eventProcessor.schedule_imm(c, ET_CALL, AIO_EVENT_DONE);
      }
      return EVENT_CONT;
    }
    // start back
    if (agg.head) {
      agg_wrap();
      goto Lagain;
    }
  }

  // evacuate space
  off_t end = header->write_pos + agg_buf_pos + EVACUATION_SIZE;
  if (evac_range(header->write_pos, end, !header->phase) < 0) {
    goto Lwait;
  }
  if (end > skip + len) {
    if (evac_range(start, start + (end - (skip + len)), header->phase) < 0) {
      goto Lwait;
    }
  }

  // if agg.head, then we are near the end of the disk, so
  // write down the aggregation in whatever size it is.
  if (agg_buf_pos < AGG_HIGH_WATER && !agg.head && !sync.head && !dir_sync_waiting) {
    goto Lwait;
  }

  // write sync marker
  if (!agg_buf_pos) {
    ink_assert(sync.head);
    int l       = round_to_approx_size(sizeof(Doc));
    agg_buf_pos = l;
    Doc *d      = reinterpret_cast<Doc *>(agg_buffer);
    memset(static_cast<void *>(d), 0, sizeof(Doc));
    d->magic        = DOC_MAGIC;
    d->len          = l;
    d->sync_serial  = header->sync_serial;
    d->write_serial = header->write_serial;
  }

  // set write limit
  header->agg_pos = header->write_pos + agg_buf_pos;

  io.aiocb.aio_fildes = fd;
  io.aiocb.aio_offset = header->write_pos;
  io.aiocb.aio_buf    = agg_buffer;
  io.aiocb.aio_nbytes = agg_buf_pos;
  io.action           = this;
  /*
    Callback on the AIO thread so that we can issue a new write ASAP,
    as all writes are serialized in the volume. This is not necessary
    for reads, which proceed independently.
   */
  io.thread = AIO_CALLBACK_THREAD_AIO;
  SET_HANDLER(&Vol::aggWriteDone);
  ink_aio_write(&io);

Lwait:
  int ret = EVENT_CONT;
  while ((c = tocall.dequeue())) {
    if (event == EVENT_CALL && c->mutex->thread_holding == mutex->thread_holding) {
      ret = EVENT_RETURN;
    } else {
      eventProcessor.schedule_imm(c, ET_CALL, AIO_EVENT_DONE);
    }
  }
  return ret;
}

int
CacheVC::openWriteCloseDir(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
{
  cancel_trigger();
  {
    CACHE_TRY_LOCK(lock, vol->mutex, mutex->thread_holding);
    if (!lock.is_locked()) {
      SET_HANDLER(&CacheVC::openWriteCloseDir);
      ink_assert(!is_io_in_progress());
      VC_SCHED_LOCK_RETRY();
    }
    vol->close_write(this);
    if (closed < 0 && fragment) {
      dir_delete(&earliest_key, vol, &earliest_dir);
    }
  }
  if (is_debug_tag_set("cache_update")) {
    if (f.update && closed > 0) {
      if (!total_len && !f.allow_empty_doc && alternate_index != CACHE_ALT_REMOVED) {
        Debug("cache_update", "header only %d (%" PRIu64 ", %" PRIu64 ")", DIR_MASK_TAG(first_key.slice32(2)), update_key.b[0],
              update_key.b[1]);

      } else if ((total_len || f.allow_empty_doc) && alternate_index != CACHE_ALT_REMOVED) {
        Debug("cache_update", "header body, %d, (%" PRIu64 ", %" PRIu64 "), (%" PRIu64 ", %" PRIu64 ")",
              DIR_MASK_TAG(first_key.slice32(2)), update_key.b[0], update_key.b[1], earliest_key.b[0], earliest_key.b[1]);
      } else if (!total_len && alternate_index == CACHE_ALT_REMOVED) {
        Debug("cache_update", "alt delete, %d, (%" PRIu64 ", %" PRIu64 ")", DIR_MASK_TAG(first_key.slice32(2)), update_key.b[0],
              update_key.b[1]);
      }
    }
  }
  // update the appropriate stat variable
  // These variables may not give the current number of documents with
  // one, two and three or more fragments. This is because for
  // updates we don't decrement the variable corresponding to the old
  // size of the document.
  if ((closed == 1) && (total_len > 0 || f.allow_empty_doc)) {
    DDebug("cache_stats", "Fragment = %d", fragment);
    switch (fragment) {
    case 0:
      CACHE_INCREMENT_DYN_STAT(cache_single_fragment_document_count_stat);
      break;
    case 1:
      CACHE_INCREMENT_DYN_STAT(cache_two_fragment_document_count_stat);
      break;
    default:
      CACHE_INCREMENT_DYN_STAT(cache_three_plus_plus_fragment_document_count_stat);
      break;
    }
  }
  if (f.close_complete) {
    recursive++;
    ink_assert(!vol || this_ethread() != vol->mutex->thread_holding);
    vio.cont->handleEvent(VC_EVENT_WRITE_COMPLETE, (void *)&vio);
    recursive--;
  }
  return free_CacheVC(this);
}

int
CacheVC::openWriteCloseHeadDone(int event, Event *e)
{
  if (event == AIO_EVENT_DONE) {
    set_io_not_in_progress();
  } else if (is_io_in_progress()) {
    return EVENT_CONT;
  }
  {
    CACHE_TRY_LOCK(lock, vol->mutex, mutex->thread_holding);
    if (!lock.is_locked()) {
      VC_LOCK_RETRY_EVENT();
    }
    od->writing_vec = false;
    if (!io.ok()) {
      goto Lclose;
    }
    ink_assert(f.use_first_key);
    if (!od->dont_update_directory) {
      if (dir_is_empty(&od->first_dir)) {
        dir_insert(&first_key, vol, &dir);
      } else {
        // multiple fragment vector write
        dir_overwrite(&first_key, vol, &dir, &od->first_dir, false);
        // insert moved resident alternate
        if (od->move_resident_alt) {
          if (dir_valid(vol, &od->single_doc_dir)) {
            dir_insert(&od->single_doc_key, vol, &od->single_doc_dir);
          }
          od->move_resident_alt = false;
        }
      }
      od->first_dir = dir;
      if (frag_type == CACHE_FRAG_TYPE_HTTP && f.single_fragment) {
        // fragment is tied to the vector
        od->move_resident_alt = true;
        if (!f.rewrite_resident_alt) {
          od->single_doc_key = earliest_key;
        }
        dir_assign(&od->single_doc_dir, &dir);
        dir_set_tag(&od->single_doc_dir, od->single_doc_key.slice32(2));
      }
    }
  }
Lclose:
  return openWriteCloseDir(event, e);
}

int
CacheVC::openWriteCloseHead(int event, Event *e)
{
  cancel_trigger();
  f.use_first_key = 1;
  if (io.ok()) {
    ink_assert(fragment || (length == (int64_t)total_len));
  } else {
    return openWriteCloseDir(event, e);
  }
  if (f.data_done) {
    write_len = 0;
  } else {
    write_len = length;
  }
  if (frag_type == CACHE_FRAG_TYPE_HTTP) {
    SET_HANDLER(&CacheVC::updateVector);
    return updateVector(EVENT_IMMEDIATE, nullptr);
  } else {
    header_len = header_to_write_len;
    SET_HANDLER(&CacheVC::openWriteCloseHeadDone);
    return do_write_lock();
  }
}

int
CacheVC::openWriteCloseDataDone(int event, Event *e)
{
  int ret = 0;
  cancel_trigger();

  if (event == AIO_EVENT_DONE) {
    set_io_not_in_progress();
  } else if (is_io_in_progress()) {
    return EVENT_CONT;
  }
  if (!io.ok()) {
    return openWriteCloseDir(event, e);
  }
  {
    CACHE_TRY_LOCK(lock, vol->mutex, this_ethread());
    if (!lock.is_locked()) {
      VC_LOCK_RETRY_EVENT();
    }
    if (!fragment) {
      ink_assert(key == earliest_key);
      earliest_dir = dir;
    } else {
      // Store the offset only if there is a table.
      // Currently there is no alt (and thence no table) for non-HTTP.
      if (alternate.valid()) {
        alternate.push_frag_offset(write_pos);
      }
    }
    fragment++;
    write_pos += write_len;
    dir_insert(&key, vol, &dir);
    blocks = iobufferblock_skip(blocks.get(), &offset, &length, write_len);
    next_CacheKey(&key, &key);
    if (length) {
      write_len = length;
      if (write_len > MAX_FRAG_SIZE) {
        write_len = MAX_FRAG_SIZE;
      }
      if ((ret = do_write_call()) == EVENT_RETURN) {
        goto Lcallreturn;
      }
      return ret;
    }
    f.data_done = 1;
    return openWriteCloseHead(event, e); // must be called under vol lock from here
  }
Lcallreturn:
  return handleEvent(AIO_EVENT_DONE, nullptr);
}

int
CacheVC::openWriteClose(int event, Event *e)
{
  cancel_trigger();
  if (is_io_in_progress()) {
    if (event != AIO_EVENT_DONE) {
      return EVENT_CONT;
    }
    set_io_not_in_progress();
    if (!io.ok()) {
      return openWriteCloseDir(event, e);
    }
  }
  if (closed > 0 || f.allow_empty_doc) {
    if (total_len == 0) {
      if (f.update || f.allow_empty_doc) {
        return updateVector(event, e);
      } else {
        // If we've been CLOSE'd but nothing has been written then
        // this close is transformed into an abort.
        closed = -1;
        return openWriteCloseDir(event, e);
      }
    }
    if (length && (fragment || length > static_cast<int>(MAX_FRAG_SIZE))) {
      SET_HANDLER(&CacheVC::openWriteCloseDataDone);
      write_len = length;
      if (write_len > MAX_FRAG_SIZE) {
        write_len = MAX_FRAG_SIZE;
      }
      return do_write_lock_call();
    } else {
      return openWriteCloseHead(event, e);
    }
  } else {
    return openWriteCloseDir(event, e);
  }
}

int
CacheVC::openWriteWriteDone(int event, Event *e)
{
  cancel_trigger();
  if (event == AIO_EVENT_DONE) {
    set_io_not_in_progress();
  } else if (is_io_in_progress()) {
    return EVENT_CONT;
  }
  // In the event of VC_EVENT_ERROR, the cont must do an io_close
  if (!io.ok()) {
    if (closed) {
      closed = -1;
      return die();
    }
    SET_HANDLER(&CacheVC::openWriteMain);
    return calluser(VC_EVENT_ERROR);
  }
  {
    CACHE_TRY_LOCK(lock, vol->mutex, mutex->thread_holding);
    if (!lock.is_locked()) {
      VC_LOCK_RETRY_EVENT();
    }
    // store the earliest directory. Need to remove the earliest dir
    // in case the writer aborts.
    if (!fragment) {
      ink_assert(key == earliest_key);
      earliest_dir = dir;
    } else {
      // Store the offset only if there is a table.
      // Currently there is no alt (and thence no table) for non-HTTP.
      if (alternate.valid()) {
        alternate.push_frag_offset(write_pos);
      }
    }
    ++fragment;
    write_pos += write_len;
    dir_insert(&key, vol, &dir);
    DDebug("cache_insert", "WriteDone: %X, %X, %d", key.slice32(0), first_key.slice32(0), write_len);
    blocks = iobufferblock_skip(blocks.get(), &offset, &length, write_len);
    next_CacheKey(&key, &key);
  }
  if (closed) {
    return die();
  }
  SET_HANDLER(&CacheVC::openWriteMain);
  return openWriteMain(event, e);
}

static inline int
target_fragment_size()
{
  uint64_t value = cache_config_target_fragment_size - sizeof(Doc);
  ink_release_assert(value <= MAX_FRAG_SIZE);
  return value;
}
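
// Illustrative note (added; assumes the stock 1 MiB setting of
// proxy.config.cache.target_fragment_size): the configured target counts
// the on-disk Doc header, so the usable payload per fragment is
// 1048576 - sizeof(Doc) bytes. The release assert above rejects any
// configured target whose payload would exceed MAX_FRAG_SIZE.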
1379 
1380 int
openWriteMain(int,Event *)1381 CacheVC::openWriteMain(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
1382 {
1383   cancel_trigger();
1384   int called_user = 0;
1385   ink_assert(!is_io_in_progress());
1386 Lagain:
1387   if (!vio.buffer.writer()) {
1388     if (calluser(VC_EVENT_WRITE_READY) == EVENT_DONE) {
1389       return EVENT_DONE;
1390     }
1391     if (!vio.buffer.writer()) {
1392       return EVENT_CONT;
1393     }
1394   }
1395   if (vio.ntodo() <= 0) {
1396     called_user = 1;
1397     if (calluser(VC_EVENT_WRITE_COMPLETE) == EVENT_DONE) {
1398       return EVENT_DONE;
1399     }
1400     ink_assert(!f.close_complete || !"close expected after write COMPLETE");
1401     if (vio.ntodo() <= 0) {
1402       return EVENT_CONT;
1403     }
1404   }
  int64_t ntodo       = static_cast<int64_t>(vio.ntodo() + length);
  int64_t total_avail = vio.buffer.reader()->read_avail();
  int64_t avail       = total_avail;
  int64_t towrite     = avail + length;
  if (towrite > ntodo) {
    avail -= (towrite - ntodo);
    towrite = ntodo;
  }
  if (towrite > static_cast<int>(MAX_FRAG_SIZE)) {
    avail -= (towrite - MAX_FRAG_SIZE);
    towrite = MAX_FRAG_SIZE;
  }
  if (!blocks && towrite) {
    blocks = vio.buffer.reader()->block;
    offset = vio.buffer.reader()->start_offset;
  }
  if (avail > 0) {
    vio.buffer.reader()->consume(avail);
    vio.ndone += avail;
    total_len += avail;
  }
  length = static_cast<uint64_t>(towrite);
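  // Fragment-cut heuristic: when the staged data only slightly overshoots the
  // target (by less than 25%), cut the write at exactly the target size and
  // keep the small remainder buffered for the next fragment; otherwise write
  // everything staged (already capped at MAX_FRAG_SIZE above). Illustrative:
  // with a 1 MB target, 1.1 MB staged is cut at 1 MB, while 1.5 MB is written
  // whole.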
  if (length > target_fragment_size() && (length < target_fragment_size() + target_fragment_size() / 4)) {
    write_len = target_fragment_size();
  } else {
    write_len = length;
  }
  bool not_writing = towrite != ntodo && towrite < target_fragment_size();
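  // Hold off on disk I/O while this is neither the final piece of the VIO
  // (towrite != ntodo) nor yet a full target fragment; waiting lets small
  // writes coalesce into fuller fragments.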
  if (!called_user) {
    if (not_writing) {
      called_user = 1;
      if (calluser(VC_EVENT_WRITE_READY) == EVENT_DONE) {
        return EVENT_DONE;
      }
      goto Lagain;
    } else if (vio.ntodo() <= 0) {
      goto Lagain;
    }
  }
  if (not_writing) {
    return EVENT_CONT;
  }
  if (towrite == ntodo && f.close_complete) {
    closed = 1;
    SET_HANDLER(&CacheVC::openWriteClose);
    return openWriteClose(EVENT_NONE, nullptr);
  }
  SET_HANDLER(&CacheVC::openWriteWriteDone);
  return do_write_lock_call();
}

// begin overwrite
int
CacheVC::openWriteOverwrite(int event, Event *e)
{
  cancel_trigger();
  if (event != AIO_EVENT_DONE) {
    if (event == EVENT_IMMEDIATE) {
      last_collision = nullptr;
    }
  } else {
    Doc *doc = nullptr;
    set_io_not_in_progress();
    if (_action.cancelled) {
      return openWriteCloseDir(event, e);
    }
    if (!io.ok()) {
      goto Ldone;
    }
    doc = reinterpret_cast<Doc *>(buf->data());
    if (!(doc->first_key == first_key)) {
      goto Lcollision;
    }
    od->first_dir = dir;
    first_buf     = buf;
    goto Ldone;
  }
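// A probe hit whose Doc header carries a different first_key is a hash
// collision: keep walking the directory collision chain (last_collision
// tracks the position) and re-reading until the matching document is found
// or the chain is exhausted.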
Lcollision : {
  CACHE_TRY_LOCK(lock, vol->mutex, this_ethread());
  if (!lock.is_locked()) {
    VC_LOCK_RETRY_EVENT();
  }
  int res = dir_probe(&first_key, vol, &dir, &last_collision);
  if (res > 0) {
    if ((res = do_read_call(&first_key)) == EVENT_RETURN) {
      goto Lcallreturn;
    }
    return res;
  }
}
Ldone:
  SET_HANDLER(&CacheVC::openWriteMain);
  return callcont(CACHE_EVENT_OPEN_WRITE);
Lcallreturn:
  return handleEvent(AIO_EVENT_DONE, nullptr); // hopefully a tail call
}

// openWriteStartDone handles vector read (addition of alternates)
// and lock misses
int
CacheVC::openWriteStartDone(int event, Event *e)
{
  intptr_t err = ECACHE_NO_DOC;
  cancel_trigger();
  if (is_io_in_progress()) {
    if (event != AIO_EVENT_DONE) {
      return EVENT_CONT;
    }
    set_io_not_in_progress();
  }
  {
    CACHE_TRY_LOCK(lock, vol->mutex, mutex->thread_holding);
    if (!lock.is_locked()) {
      VC_LOCK_RETRY_EVENT();
    }

    if (_action.cancelled && (!od || !od->has_multiple_writers())) {
      goto Lcancel;
    }

    if (event == AIO_EVENT_DONE) { // vector read done
      Doc *doc = reinterpret_cast<Doc *>(buf->data());
      if (!io.ok()) {
        err = ECACHE_READ_FAIL;
        goto Lfailure;
      }

      /* INKqa07123.
         A directory entry which is no longer valid may have been overwritten.
         We need to start afresh from the beginning by setting last_collision
         to nullptr.
       */
      if (!dir_valid(vol, &dir)) {
        DDebug("cache_write", "openWriteStartDone: Dir not valid: Write Head: %" PRId64 ", Dir: %" PRId64,
               (int64_t)vol->offset_to_vol_offset(vol->header->write_pos), dir_offset(&dir));
        last_collision = nullptr;
        goto Lcollision;
      }
      if (!(doc->first_key == first_key)) {
        goto Lcollision;
      }

      if (doc->magic != DOC_MAGIC || !doc->hlen || this->load_http_info(write_vector, doc, buf.object()) != doc->hlen) {
        err = ECACHE_BAD_META_DATA;
        goto Lfailure;
      }
      ink_assert(write_vector->count() > 0);
      od->first_dir = dir;
      first_dir     = dir;
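      // For a single-fragment document the body is stored inline with the
      // vector, so remember its key and directory entry: when the vector is
      // rewritten, the resident alternate has to be moved along with it.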
      if (doc->single_fragment()) {
        // fragment is tied to the vector
        od->move_resident_alt = true;
        od->single_doc_key    = doc->key;
        dir_assign(&od->single_doc_dir, &dir);
        dir_set_tag(&od->single_doc_dir, od->single_doc_key.slice32(2));
      }
      first_buf = buf;
      goto Lsuccess;
    }

  Lcollision:
    int if_writers = ((uintptr_t)info == CACHE_ALLOW_MULTIPLE_WRITES);
    if (!od) {
      if ((err = vol->open_write(this, if_writers, cache_config_http_max_alts > 1 ? cache_config_http_max_alts : 0)) > 0) {
        goto Lfailure;
      }
      if (od->has_multiple_writers()) {
        MUTEX_RELEASE(lock);
        SET_HANDLER(&CacheVC::openWriteMain);
        return callcont(CACHE_EVENT_OPEN_WRITE);
      }
    }
    // check for collision
    if (dir_probe(&first_key, vol, &dir, &last_collision)) {
      od->reading_vec = true;
      int ret         = do_read_call(&first_key);
      if (ret == EVENT_RETURN) {
        goto Lcallreturn;
      }
      return ret;
    }
    if (f.update) {
      // fail update because vector has been GC'd
      goto Lfailure;
    }
  }
Lsuccess:
  od->reading_vec = false;
  if (_action.cancelled) {
    goto Lcancel;
  }
  SET_HANDLER(&CacheVC::openWriteMain);
  return callcont(CACHE_EVENT_OPEN_WRITE);

Lfailure:
  CACHE_INCREMENT_DYN_STAT(base_stat + CACHE_STAT_FAILURE);
  _action.continuation->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (void *)-err);
Lcancel:
  if (od) {
    od->reading_vec = false;
    return openWriteCloseDir(event, e);
  } else {
    return free_CacheVC(this);
  }
Lcallreturn:
  return handleEvent(AIO_EVENT_DONE, nullptr); // hopefully a tail call
}

// handle lock failures from main Cache::open_write entry points below
int
CacheVC::openWriteStartBegin(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
{
  intptr_t err;
  cancel_trigger();
  if (_action.cancelled) {
    return free_CacheVC(this);
  }
  if (((err = vol->open_write_lock(this, false, 1)) > 0)) {
    CACHE_INCREMENT_DYN_STAT(base_stat + CACHE_STAT_FAILURE);
    free_CacheVC(this);
    _action.continuation->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (void *)-err);
    return EVENT_DONE;
  }
  if (err < 0) {
    VC_SCHED_LOCK_RETRY();
  }
  if (f.overwrite) {
    SET_HANDLER(&CacheVC::openWriteOverwrite);
    return openWriteOverwrite(EVENT_IMMEDIATE, nullptr);
  } else {
    // write by key
    SET_HANDLER(&CacheVC::openWriteMain);
    return callcont(CACHE_EVENT_OPEN_WRITE);
  }
}

// main entry point for writing of non-HTTP documents
Action *
Cache::open_write(Continuation *cont, const CacheKey *key, CacheFragType frag_type, int options, time_t apin_in_cache,
                  const char *hostname, int host_len)
{
  if (!CacheProcessor::IsCacheReady(frag_type)) {
    cont->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (void *)-ECACHE_NOT_READY);
    return ACTION_RESULT_DONE;
  }

  ink_assert(caches[frag_type] == this);

  intptr_t res      = 0;
  CacheVC *c        = new_CacheVC(cont);
  ProxyMutex *mutex = cont->mutex.get();
  SCOPED_MUTEX_LOCK(lock, c->mutex, this_ethread());
  c->vio.op    = VIO::WRITE;
  c->base_stat = cache_write_active_stat;
  c->vol       = key_to_vol(key, hostname, host_len);
  Vol *vol     = c->vol;
  CACHE_INCREMENT_DYN_STAT(c->base_stat + CACHE_STAT_ACTIVE);
  c->first_key = c->key = *key;
  c->frag_type          = frag_type;
  /*
     The transition from a single-fragment document to a multi-fragment
     document would cause a problem if the key and the first_key collide,
     because old vector data could be served to HTTP. Need to avoid that.
     Also, when evacuating a fragment, we have to decide whether it is the
     first_key or the earliest_key based on the dir_tag.
   */
  do {
    rand_CacheKey(&c->key, cont->mutex);
  } while (DIR_MASK_TAG(c->key.slice32(2)) == DIR_MASK_TAG(c->first_key.slice32(2)));
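  // Only the masked tag bits of slice32(2) are stored in the directory entry,
  // so regenerate the random key until its tag differs from first_key's tag;
  // that way a directory lookup can never confuse the two key chains.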
  c->earliest_key     = c->key;
  c->info             = nullptr;
  c->f.overwrite      = (options & CACHE_WRITE_OPT_OVERWRITE) != 0;
  c->f.close_complete = (options & CACHE_WRITE_OPT_CLOSE_COMPLETE) != 0;
  c->f.sync           = (options & CACHE_WRITE_OPT_SYNC) == CACHE_WRITE_OPT_SYNC;
  c->pin_in_cache     = static_cast<uint32_t>(apin_in_cache);

  if ((res = c->vol->open_write_lock(c, false, 1)) > 0) {
    // document currently being written, abort
    CACHE_INCREMENT_DYN_STAT(c->base_stat + CACHE_STAT_FAILURE);
    cont->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (void *)-res);
    free_CacheVC(c);
    return ACTION_RESULT_DONE;
  }
  if (res < 0) {
    SET_CONTINUATION_HANDLER(c, &CacheVC::openWriteStartBegin);
    c->trigger = CONT_SCHED_LOCK_RETRY(c);
    return &c->_action;
  }
  if (!c->f.overwrite) {
    SET_CONTINUATION_HANDLER(c, &CacheVC::openWriteMain);
    c->callcont(CACHE_EVENT_OPEN_WRITE);
    return ACTION_RESULT_DONE;
  } else {
    SET_CONTINUATION_HANDLER(c, &CacheVC::openWriteOverwrite);
    if (c->openWriteOverwrite(EVENT_IMMEDIATE, nullptr) == EVENT_DONE) {
      return ACTION_RESULT_DONE;
    } else {
      return &c->_action;
    }
  }
}

// main entry point for writing of HTTP documents
Action *
Cache::open_write(Continuation *cont, const CacheKey *key, CacheHTTPInfo *info, time_t apin_in_cache,
                  const CacheKey * /* key1 ATS_UNUSED */, CacheFragType type, const char *hostname, int host_len)
{
  if (!CacheProcessor::IsCacheReady(type)) {
    cont->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (void *)-ECACHE_NOT_READY);
    return ACTION_RESULT_DONE;
  }

  ink_assert(caches[type] == this);
  intptr_t err      = 0;
  int if_writers    = (uintptr_t)info == CACHE_ALLOW_MULTIPLE_WRITES;
  CacheVC *c        = new_CacheVC(cont);
  ProxyMutex *mutex = cont->mutex.get();
  c->vio.op         = VIO::WRITE;
  c->first_key      = *key;
  /*
     The transition from a single-fragment document to a multi-fragment
     document would cause a problem if the key and the first_key collide,
     because old vector data could be served to HTTP. Need to avoid that.
     Also, when evacuating a fragment, we have to decide whether it is the
     first_key or the earliest_key based on the dir_tag.
   */
  do {
    rand_CacheKey(&c->key, cont->mutex);
  } while (DIR_MASK_TAG(c->key.slice32(2)) == DIR_MASK_TAG(c->first_key.slice32(2)));
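  // (same directory-tag collision avoidance as the non-HTTP open_write above)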
  c->earliest_key = c->key;
  c->frag_type    = CACHE_FRAG_TYPE_HTTP;
  c->vol          = key_to_vol(key, hostname, host_len);
  Vol *vol        = c->vol;
  c->info         = info;
  if (c->info && (uintptr_t)info != CACHE_ALLOW_MULTIPLE_WRITES) {
    /*
       Update has the following code paths:
       a) Update alternate header only:
       In this case the vector has to be rewritten. The content
       length (update_len) and the key for the document are set in the
       new_info in the set_http_info call.
       HTTP OPERATIONS
       open_write with info set
       set_http_info new_info
       (total_len == 0)
       close
       b) Update alternate and data:
       In this case both the vector and the data need to be rewritten.
       This case is similar to the standard write of a document, except
       that the new_info is inserted into the vector at the alternate_index
       (overwriting the old alternate) rather than at the end of the vector.
       HTTP OPERATIONS
       open_write with info set
       set_http_info new_info
       do_io_write =>  (total_len > 0)
       close
       c) Delete an alternate:
       The vector may need to be deleted (if there was only one alternate) or
       rewritten (if there was more than one alternate). The deletion of the
       vector is done in openWriteRemoveVector.
       HTTP OPERATIONS
       open_write with info set
       close
     */
    c->f.update  = 1;
    c->base_stat = cache_update_active_stat;
    DDebug("cache_update", "Update called");
    info->object_key_get(&c->update_key);
    ink_assert(!(c->update_key == zero_key));
    c->update_len = info->object_size_get();
  } else {
    c->base_stat = cache_write_active_stat;
  }
  CACHE_INCREMENT_DYN_STAT(c->base_stat + CACHE_STAT_ACTIVE);
  c->pin_in_cache = static_cast<uint32_t>(apin_in_cache);

  {
    CACHE_TRY_LOCK(lock, c->vol->mutex, cont->mutex->thread_holding);
    if (lock.is_locked()) {
      if ((err = c->vol->open_write(c, if_writers, cache_config_http_max_alts > 1 ? cache_config_http_max_alts : 0)) > 0) {
        goto Lfailure;
      }
      // If there are multiple writers, then this one cannot be an update.
      // Only the first writer can do an update. If that's the case, we can
      // return success to the state machine now.
      if (c->od->has_multiple_writers()) {
        goto Lmiss;
      }
      if (!dir_probe(key, c->vol, &c->dir, &c->last_collision)) {
        if (c->f.update) {
          // fail the update because the vector has been GC'd.
          // This situation can also arise in openWriteStartDone.
          err = ECACHE_NO_DOC;
          goto Lfailure;
        }
        // document doesn't exist, begin write
        goto Lmiss;
      } else {
        c->od->reading_vec = true;
        // document exists, read vector
        SET_CONTINUATION_HANDLER(c, &CacheVC::openWriteStartDone);
        switch (c->do_read_call(&c->first_key)) {
        case EVENT_DONE:
          return ACTION_RESULT_DONE;
        case EVENT_RETURN:
          goto Lcallreturn;
        default:
          return &c->_action;
        }
      }
    }
    // missed lock
    SET_CONTINUATION_HANDLER(c, &CacheVC::openWriteStartDone);
    CONT_SCHED_LOCK_RETRY(c);
    return &c->_action;
  }

Lmiss:
  SET_CONTINUATION_HANDLER(c, &CacheVC::openWriteMain);
  c->callcont(CACHE_EVENT_OPEN_WRITE);
  return ACTION_RESULT_DONE;

Lfailure:
  CACHE_INCREMENT_DYN_STAT(c->base_stat + CACHE_STAT_FAILURE);
  cont->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (void *)-err);
  if (c->od) {
    c->openWriteCloseDir(EVENT_IMMEDIATE, nullptr);
    return ACTION_RESULT_DONE;
  }
  free_CacheVC(c);
  return ACTION_RESULT_DONE;

Lcallreturn:
  if (c->handleEvent(AIO_EVENT_DONE, nullptr) == EVENT_DONE) {
    return ACTION_RESULT_DONE;
  }
  return &c->_action;
}