simple-ssd

created : 2020-06-10T10:39:41+00:00
modified : 2020-08-26T11:54:02+00:00
ssd

Things I newly learned


Reading the documentation

Sketching it out as a diagram

Host Controller

Host Interface
```cpp
class DMAInterface {
 public:
  DMAInterface() {}
  virtual ~DMAInterface() {}

  virtual void dmaRead(uint64_t, uint64_t, uint8_t *, DMAFunction &,
                       void * = nullptr) = 0;
  virtual void dmaWrite(uint64_t, uint64_t, uint8_t *, DMAFunction &,
                        void * = nullptr) = 0;
};

class Interface : public SimpleSSD::DMAInterface {
 protected:
  Controller *pController;

 public:
  virtual void updateInterrupt(uint16_t, bool) = 0;
  virtual void getVendorID(uint16_t &, uint16_t &) = 0;
};
```
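These two classes only declare the DMA contract; everything is callback-driven. A minimal sketch of how a caller drives `dmaRead`, assuming `DMAFunction` is a `std::function<void(uint64_t, void *)>` (which matches the lambda signatures that appear later in `Subsystem::read`); `ImmediateDMA` is a hypothetical implementation, only to show the flow:

```cpp
#include <cstdint>
#include <functional>

// Assumed from the lambda signatures elsewhere in this note: the first
// argument is the simulation tick at completion, the second is user context.
using DMAFunction = std::function<void(uint64_t, void *)>;

// Hypothetical implementation; a real model would schedule the completion
// callback at a future simulated tick instead of firing it immediately.
struct ImmediateDMA {
  void dmaRead(uint64_t addr, uint64_t size, uint8_t *buffer, DMAFunction &func,
               void *context = nullptr) {
    (void)addr; (void)size; (void)buffer;
    func(/*tick=*/0, context);  // complete "immediately"
  }
};

int main() {
  ImmediateDMA dma;
  uint8_t buf[512];
  bool done = false;

  DMAFunction onComplete = [&done](uint64_t, void *) { done = true; };
  dma.dmaRead(0x1000, sizeof(buf), buf, onComplete, nullptr);

  return done ? 0 : 1;
}
```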
Controller and Firmware
Controller
Subsystem
Namespace

Serial AT Attachment

Host interface
Host Bus Adapter
Device

Universal Flash Storage

Host Interface
Host Controller interface
Device

Comparison with the nvme source code

Program entry point

A BIO is issued -> driver

Extracting the opcode from the request

```cpp
memset(cmd, 0, 64);

uint64_t slba = bio.offset / LBAsize;
uint32_t nlb = (uint32_t)DIVCEIL(bio.length, LBAsize);

cmd[1] = namespaceID;  // NSID

if (bio.type == BIL::BIO_READ) {
  cmd[0] = SimpleSSD::HIL::NVMe::OPCODE_READ;  // CID, FUSE, OPC
  cmd[10] = (uint32_t)slba;
  cmd[11] = slba >> 32;
  cmd[12] = nlb - 1;  // LR, FUA, PRINFO, NLB

  prp = new PRP(bio.length);
  prp->getPointer(*(uint64_t *)(cmd + 6), *(uint64_t *)(cmd + 8));  // DPTR
}
else if (bio.type == BIL::BIO_WRITE) {
  cmd[0] = SimpleSSD::HIL::NVMe::OPCODE_WRITE;  // CID, FUSE, OPC
  cmd[10] = (uint32_t)slba;
  cmd[11] = slba >> 32;
  cmd[12] = nlb - 1;  // LR, FUA, PRINFO, DTYPE, NLB

  prp = new PRP(bio.length);
  prp->getPointer(*(uint64_t *)(cmd + 6), *(uint64_t *)(cmd + 8));  // DPTR
}
else if (bio.type == BIL::BIO_FLUSH) {
  cmd[0] = SimpleSSD::HIL::NVMe::OPCODE_FLUSH;  // CID, FUSE, OPC
}
else if (bio.type == BIL::BIO_TRIM) {
  cmd[0] = SimpleSSD::HIL::NVMe::OPCODE_DATASET_MANAGEMEMT;  // CID, FUSE, OPC
  cmd[10] = 0;                                               // NR
  cmd[11] = 0x04;                                            // AD

  prp = new PRP(16);
  prp->getPointer(*(uint64_t *)(cmd + 6), *(uint64_t *)(cmd + 8));  // DPTR

  // Fill range definition
  uint8_t data[16];

  memset(data, 0, 16);
  memcpy(data + 4, &nlb, 4);
  memcpy(data + 8, &slba, 8);

  prp->writeData(0, 16, data);
}

submitCommand(1, (uint8_t *)cmd, callback, new IOWrapper(bio.id, prp, bio.callback));
}
```
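A note on the layout: judging from `memset(cmd, 0, 64)` and the indices, `cmd` is a 16-dword (64-byte) NVMe Submission Queue Entry. Per the NVMe spec, `cmd[0]` carries OPC/FUSE/CID, `cmd[1]` the NSID, `cmd[6..9]` the DPTR (PRP1/PRP2), `cmd[10..11]` the SLBA split across two dwords, and `cmd[12]` the zero-based NLB plus flags. A quick sanity check of the SLBA split:

```cpp
#include <cstdint>

int main() {
  uint32_t cmd[16] = {};         // one 64-byte SQE, as in the driver above
  uint64_t slba = 0x123456789ULL;

  cmd[10] = (uint32_t)slba;      // SLBA, low dword
  cmd[11] = slba >> 32;          // SLBA, high dword

  uint64_t restored = ((uint64_t)cmd[11] << 32) | cmd[10];
  return restored == slba ? 0 : 1;  // 0: round-trip succeeded
}
```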


###### nvme (driver) implementation
```c
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmd)
{
	blk_status_t ret = BLK_STS_OK;

	nvme_clear_nvme_request(req);

	memset(cmd, 0, sizeof(*cmd));
	switch (req_op(req)) {
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
		break;
	case REQ_OP_FLUSH:
		nvme_setup_flush(ns, cmd);
		break;
	case REQ_OP_WRITE_ZEROES:
		ret = nvme_setup_write_zeroes(ns, req, cmd);
		break;
	case REQ_OP_DISCARD:
		ret = nvme_setup_discard(ns, req, cmd);
		break;
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		ret = nvme_setup_rw(ns, req, cmd);
		break;
	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	cmd->common.command_id = req->tag;
	trace_nvme_setup_cmd(req, cmd);
	return ret;
}
```
Submit command

```cpp
// Push to queue
if (iv == 0) {
  increaseCommandID(adminCommandID);
  cid = adminCommandID;
  queue = adminSQ;
}
else if (iv == 1 && ioSQ) {
  increaseCommandID(ioCommandID);
  cid = ioCommandID;
  queue = ioSQ;
}
else {
  SimpleSSD::panic("I/O Submission Queue is not initialized");
}

memcpy(cmd + 2, &cid, 2);
queue->setData(cmd, 64);
tail = queue->getTail();

// Push to pending cmd list
pendingCommandList.push_back(CommandEntry(iv, opcode, cid, context, func));

// Ring doorbell
pController->ringSQTailDoorbell(iv, tail, tick);
queue->incrHead();
}
```
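What `submitCommand` does after filling the entry is the same circular-queue bookkeeping the kernel does: write the entry at the tail, advance the tail modulo the queue depth, and publish the new tail through the SQ doorbell. A minimal sketch of just that arithmetic (the names are mine, not SimpleSSD's):

```cpp
#include <cstdint>

struct SQ {
  uint16_t sqTail = 0;
  uint16_t qDepth;

  explicit SQ(uint16_t depth) : qDepth(depth) {}

  // Returns the new tail, i.e. the value written to the SQ tail doorbell.
  uint16_t push() {
    if (++sqTail == qDepth)
      sqTail = 0;  // wrap around, exactly like nvme_submit_cmd below
    return sqTail;
  }
};

int main() {
  SQ sq(4);
  sq.push(); sq.push(); sq.push();
  return sq.push() == 0 ? 0 : 1;  // fourth push wraps back to 0
}
```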

##### nvme implementation
```c
/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 * @write_sq: whether to write to the SQ doorbell
 */
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
{
	spin_lock(&nvmeq->sq_lock);
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
	       cmd, sizeof(*cmd));
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	if (write_sq)
		nvme_write_sq_db(nvmeq);
	spin_unlock(&nvmeq->sq_lock);
}
```

Work

SimpleSSD

SSD Interface

Internal Cache Layer

Abstract Class

Set-Associative Cache

Call structure

```cpp
  if (response) {
    func(resp);
  }
}
```


 * Here, functions such as `read` in turn call `Subsystem::read()`, whose body is:
```cpp
void Subsystem::read(Namespace *ns, uint64_t slba, uint64_t nlblk,
                     DMAFunction &func, void *context) {
  Request *req = new Request(func, context);
  DMAFunction doRead = [this](uint64_t, void *context) {
    auto req = (Request *)context;

    pHIL->read(*req);

    delete req;
  };

  convertUnit(ns, slba, nlblk, *req);

  execute(CPU::NVME__SUBSYSTEM, CPU::CONVERT_UNIT, doRead, req);
}
```
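The interesting part is the continuation style: the actual work is wrapped in a `DMAFunction` and handed to `execute()`, which (as I understand it) charges the modeled firmware CPU latency before invoking the callback. A minimal sketch of the pattern, with a stand-in `execute` and made-up latency numbers rather than SimpleSSD's actual CPU model:

```cpp
#include <cstdint>
#include <functional>

using DMAFunction = std::function<void(uint64_t, void *)>;

// Stand-in for SimpleSSD's execute(): model the firmware latency of the
// named function, then run the continuation at the resulting tick.
static void execute(uint64_t firmwareLatency, DMAFunction func, void *context) {
  uint64_t now = 1000;                    // pretend current simulation tick
  uint64_t done = now + firmwareLatency;  // tick when the CPU work finishes
  func(done, context);                    // continuation runs "later"
}

int main() {
  int lpn = 42;
  DMAFunction doRead = [](uint64_t tick, void *ctx) {
    // In Subsystem::read this is where pHIL->read(*req) happens.
    (void)tick; (void)ctx;
  };
  execute(/*CONVERT_UNIT cost*/ 500, doRead, &lpn);
}
```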

Studying the cache logic

```cpp
debugprint(LOG_ICL_GENERIC_CACHE,
           "READ  | REQ %7u-%-4u | LCA %" PRIu64 " | SIZE %" PRIu64,
           req.reqID, req.reqSubID, req.range.slpn, req.length);

if (useReadCaching) {
/* Check whether a read cache is in use. */
/* slpn seems to be short for "start logical page number"? */
uint32_t setIdx = calcSetIndex(req.range.slpn); /* see Set-Associative Cache */
uint32_t wayIdx;
uint64_t arrived = tick;

/* Not sure what this is. Is it checking for consecutive requests?
 * It turns out this is related to prefetching, but does it always assume the
 * prediction is sequential? A complicated algorithm for choosing the prefetch
 * region would certainly cause problems, but is there a paper showing that
 * prefetching only sequential ranges is best?
 * TODO: read the relevant paper
 */
if (useReadPrefetch) {
  checkSequential(req, readDetect);
}

wayIdx = getValidWay(req.range.slpn, tick);

// Do we have valid data?
if (wayIdx != waySize) {
  uint64_t tickBackup = tick;

  // Wait cache to be valid
  if (tick < cacheData[setIdx][wayIdx].insertedAt) {
    tick = cacheData[setIdx][wayIdx].insertedAt;
  }

  // Update last accessed time
  cacheData[setIdx][wayIdx].lastAccessed = tick;

  // DRAM access
  pDRAM->read(&cacheData[setIdx][wayIdx], req.length, tick);

  debugprint(LOG_ICL_GENERIC_CACHE,
             "READ  | Cache hit at (%u, %u) | %" PRIu64 " - %" PRIu64
             " (%" PRIu64 ")",
             setIdx, wayIdx, arrived, tick, tick - arrived);

  ret = true;

  // Do we need to prefetch data?
  if (useReadPrefetch && req.range.slpn == prefetchTrigger) {
    debugprint(LOG_ICL_GENERIC_CACHE, "READ  | Prefetch triggered");

    req.range.slpn = lastPrefetched;

    // Backup tick
    arrived = tick;
    tick = tickBackup;

    goto ICL_GENERIC_CACHE_READ;
  }
}
// We should read data from NVM
else {
ICL_GENERIC_CACHE_READ:
  FTL::Request reqInternal(lineCountInSuperPage, req);
  std::vector<std::pair<uint64_t, uint64_t>> readList;
  uint32_t row, col;  // Variable for I/O position (IOFlag)
  uint64_t dramAt;
  uint64_t beginLCA, endLCA;
  uint64_t beginAt, finishedAt = tick;

  if (readDetect.enabled) {
    // TEMP: Disable DRAM calculation for prevent conflict
    pDRAM->setScheduling(false);

    if (!ret) {
      debugprint(LOG_ICL_GENERIC_CACHE, "READ  | Read ahead triggered");
    }

    beginLCA = req.range.slpn;

    // If super-page is disabled, just read all pages from all planes
    if (prefetchMode == MODE_ALL || !bSuperPage) {
      endLCA = beginLCA + lineCountInMaxIO;
      prefetchTrigger = beginLCA + lineCountInMaxIO / 2;
    }
    else {
      endLCA = beginLCA + lineCountInSuperPage;
      prefetchTrigger = beginLCA + lineCountInSuperPage / 2;
    }

    lastPrefetched = endLCA;
  }
  else {
    beginLCA = req.range.slpn;
    endLCA = beginLCA + 1;
  }

  for (uint64_t lca = beginLCA; lca < endLCA; lca++) {
    beginAt = tick;

    // Check cache
    if (getValidWay(lca, beginAt) != waySize) {
      continue;
    }

    // Find way to write data read from NVM
    setIdx = calcSetIndex(lca);
    wayIdx = getEmptyWay(setIdx, beginAt);

    if (wayIdx == waySize) {
      wayIdx = evictFunction(setIdx, beginAt);

      if (cacheData[setIdx][wayIdx].dirty) {
        // We need to evict data before write
        calcIOPosition(cacheData[setIdx][wayIdx].tag, row, col);
        evictData[row][col] = cacheData[setIdx] + wayIdx;
      }
    }

    cacheData[setIdx][wayIdx].insertedAt = beginAt;
    cacheData[setIdx][wayIdx].lastAccessed = beginAt;
    cacheData[setIdx][wayIdx].valid = true;
    cacheData[setIdx][wayIdx].dirty = false;

    readList.push_back({lca, ((uint64_t)setIdx << 32) | wayIdx});

    finishedAt = MAX(finishedAt, beginAt);
  }

  tick = finishedAt;

  evictCache(tick);

  for (auto &iter : readList) {
    Line *pLine = &cacheData[iter.second >> 32][iter.second & 0xFFFFFFFF];

    // Read data
    reqInternal.lpn = iter.first / lineCountInSuperPage;
    reqInternal.ioFlag.reset();
    reqInternal.ioFlag.set(iter.first % lineCountInSuperPage);

    beginAt = tick;  // Ignore cache metadata access

    // If superPageSizeData is true, read first LPN only
    pFTL->read(reqInternal, beginAt);

    // DRAM delay
    dramAt = pLine->insertedAt;
    pDRAM->write(pLine, lineSize, dramAt);

    // Set cache data
    beginAt = MAX(beginAt, dramAt);

    pLine->insertedAt = beginAt;
    pLine->lastAccessed = beginAt;
    pLine->tag = iter.first;

    if (pLine->tag == req.range.slpn) {
      finishedAt = beginAt;
    }

    debugprint(LOG_ICL_GENERIC_CACHE,
               "READ  | Cache miss at (%u, %u) | %" PRIu64 " - %" PRIu64
               " (%" PRIu64 ")",
               iter.second >> 32, iter.second & 0xFFFFFFFF, tick, beginAt,
               beginAt - tick);
  }

  tick = finishedAt;

  if (readDetect.enabled) {
    if (ret) {
      // This request was prefetch
      debugprint(LOG_ICL_GENERIC_CACHE, "READ  | Prefetch done");

      // Restore tick
      tick = arrived;
    }
    else {
      debugprint(LOG_ICL_GENERIC_CACHE, "READ  | Read ahead done");
    }

    // TEMP: Restore
    pDRAM->setScheduling(true);
  }
}

tick += applyLatency(CPU::ICL__GENERIC_CACHE, CPU::READ);
}
else {
  FTL::Request reqInternal(lineCountInSuperPage, req);

  pDRAM->write(nullptr, req.length, tick);

  pFTL->read(reqInternal, tick);
}

stat.request[0]++;

if (ret) {
  stat.cache[0]++;
}

return ret;
}

```
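To keep the lookup straight in my head: `calcSetIndex()` maps an LPN to a set and `getValidWay()` scans that set's ways for a matching valid tag, returning `waySize` as the miss sentinel, which is why the code above compares against `waySize`. A minimal sketch of that structure; the field and function names follow the listing, but the modulo indexing is my assumption about `calcSetIndex`:

```cpp
#include <cstdint>
#include <vector>

struct Line {
  uint64_t tag = 0;
  bool valid = false;
};

struct Cache {
  uint32_t setSize, waySize;
  std::vector<std::vector<Line>> cacheData;

  Cache(uint32_t sets, uint32_t ways)
      : setSize(sets), waySize(ways),
        cacheData(sets, std::vector<Line>(ways)) {}

  // Assumed: simple modulo mapping from LPN to set.
  uint32_t calcSetIndex(uint64_t lpn) const { return lpn % setSize; }

  // Returns waySize when no way holds the LPN (the miss sentinel above).
  uint32_t getValidWay(uint64_t lpn) const {
    uint32_t setIdx = calcSetIndex(lpn);
    for (uint32_t way = 0; way < waySize; way++) {
      if (cacheData[setIdx][way].valid && cacheData[setIdx][way].tag == lpn)
        return way;
    }
    return waySize;
  }
};

int main() {
  Cache c(128, 8);
  return c.getValidWay(12345) == c.waySize ? 0 : 1;  // cold cache: miss
}
```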