
来源:互联网 发布:360企业云盘mac版 编辑:程序博客网 时间:2024/06/03 21:24

ceph version: Kraken


/*
 * Identify which object store backend owns the device at `path` and
 * report its fsid.
 *
 * BlueStore is probed first (only when built with libaio); FileStore's
 * journal header is probed second.
 *
 * Returns 0 with *fsid filled in by the backend that recognised the
 * device, or -EINVAL when no probe succeeded.
 */
int ObjectStore::probe_block_device_fsid(
  CephContext *cct,
  const string& path,
  uuid_d *fsid)
{
#if defined(HAVE_LIBAIO)
  // first try bluestore -- it has a crc on its header and will fail
  // reliably.
  if (BlueStore::get_block_device_fsid(cct, path, fsid) == 0) {
    lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, "
                          << *fsid << dendl;
    return 0;
  }
#endif

  // okay, try FileStore (journal).
  if (FileStore::get_block_device_fsid(cct, path, fsid) != 0) {
    return -EINVAL;
  }

  lgeneric_dout(cct, 0) << __func__ << " " << path << " is filestore, "
                        << *fsid << dendl;
  return 0;
}
BlueStore 获取 OSD 的 uuid:该 uuid 保存在 bluestore_bdev_label_t 结构的 osd_uuid 字段中,而该结构经序列化后存放在块设备的第一个块(前 BDEV_LABEL_BLOCK_SIZE = 4096 字节)中,并附带 crc32c 校验值。
int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,                                                                                                                        ¦ ¦ ¦uuid_d *fsid)                                                                                                                {                                                                                                                                                                   bluestore_bdev_label_t label;                                                                                                                                     int r = _read_bdev_label(cct, path, &label);                                                                                                                                                 if (r < 0)                                                                                                                                                        ¦ return r;                                                                                                                                                       *fsid = label.osd_uuid;                                                                                                                                           return 0;                                                                                                                                                       }
int BlueStore::_read_bdev_label(CephContext* cct, string path,                                bluestore_bdev_label_t *label){  dout(10) << __func__ << dendl;  //打开设备  int fd = ::open(path.c_str(), O_RDONLY);  if (fd < 0) {  ¦ fd = -errno;  ¦ derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)         << dendl;  ¦ return fd;  }  bufferlist bl;  //从设备中读取指定大小的数据  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE); //BDEV_LABEL_BLOCK_SIZE = 4096第一个数据块  VOID_TEMP_FAILURE_RETRY(::close(fd));  if (r < 0) {  ¦ derr << __func__ << " failed to read from " << path         << ": " << cpp_strerror(r) << dendl;  ¦ return r;  }//校验数据的完整性,并将其反序列化  uint32_t crc, expected_crc;  bufferlist::iterator p = bl.begin();  try {  ¦ ::decode(*label, p);  ¦ bufferlist t;  ¦ t.substr_of(bl, 0, p.get_off());  ¦ crc = t.crc32c(-1);  ¦ ::decode(expected_crc, p);  }  catch (buffer::error& e) {  ¦ derr << __func__ << " unable to decode label at offset " << p.get_off()         << ": " << e.what()         << dendl;  ¦ return -EINVAL;  }  if (crc != expected_crc) {  ¦ derr << __func__ << " bad crc on label, expected " << expected_crc         << " != actual " << crc << dendl;                                                                                                                                                     ¦ return -EIO;  }  dout(10) << __func__ << " got " << *label << dendl;  return 0;}

FileStore 获取 OSD 的 uuid:以只读方式打开日志(journal)文件或块设备,读取其头部 header_t,并取出其中的 fsid 字段。

int FileStore::get_block_device_fsid(CephContext* cct, const string& path,                                     uuid_d *fsid){  // make sure we don't try to use aio or direct_io (and get annoying  // error messages from failing to do so); performance implications  // should be irrelevant for this use  FileJournal j(cct, *fsid, 0, 0, path.c_str(), false, false);  return j.peek_fsid(*fsid);                                                                                                                                                                 }// This can not be used on an active journalint FileJournal::peek_fsid(uuid_d& fsid){  assert(fd == -1);  int r = _open(false, false);  if (r)  ¦ return r;  r = read_header(&header);  if (r < 0)  ¦ goto out;  fsid = header.fsid;out:  close();  return r;} int FileJournal::_open(bool forwrite, bool create){  int flags, ret;  if (forwrite) {  ¦ flags = O_RDWR;  ¦ if (directio)  ¦ ¦ flags |= O_DIRECT | O_DSYNC;  } else {  ¦ flags = O_RDONLY;  }  if (create)  ¦ flags |= O_CREAT;  if (fd >= 0) {  ¦ if (TEMP_FAILURE_RETRY(::close(fd))) {  ¦ ¦ int err = errno;  ¦ ¦ derr << "FileJournal::_open: error closing old fd: "        ¦ ¦<< cpp_strerror(err) << dendl;  ¦ }  }  //打开日志设备  fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags, 0644));  if (fd < 0) {  ¦ int err = errno;  ¦ dout(2) << "FileJournal::_open unable to open journal "        ¦ ¦ << fn << ": " << cpp_strerror(err) << dendl;  ¦ return -err;  }//获取指定文件的元信息,读取初始化日志文件(或设备)的相关数据(大小,块大小)  struct stat st;  ret = ::fstat(fd, &st);  if (ret) {  ¦ ret = errno;  ¦ derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;  ¦ ret = -ret;  ¦ goto out_fd;  }   //判断是常规文件还是裸块设备  if (S_ISBLK(st.st_mode)) {  ¦ ret = _open_block_device();  } else if (S_ISREG(st.st_mode)) {  ¦ if (aio && !force_aio) {  ¦ ¦ derr << "FileJournal::_open: disabling aio for non-block journal.  
Use "        ¦ ¦<< "journal_force_aio to force use of aio anyway" << dendl;  ¦ ¦ aio = false;  ¦ }  ¦ ret = _open_file(st.st_size, st.st_blksize, create);  } else {  ¦ derr << "FileJournal::_open: wrong journal file type: " << st.st_mode        ¦<< dendl;  ¦ ret = -EINVAL;  }  if (ret)  ¦ goto out_fd;//初始化libaio#ifdef HAVE_LIBAIO  if (aio) {  ¦ aio_ctx = 0;  ¦ ret = io_setup(128, &aio_ctx);  ¦ if (ret < 0) {  ¦ ¦ switch (ret) {        // Contrary to naive expectations -EAGIAN means ...        case -EAGAIN:        ¦ derr << "FileJournal::_open: user's limit of aio events exceeded. "        ¦ ¦ ¦ ¦<< "Try increasing /proc/sys/fs/aio-max-nr" << dendl;        ¦ break;        default:        ¦ derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;        ¦ break;  ¦ ¦ }  ¦ ¦ goto out_fd;  ¦ }  }#endif  /* We really want max_size to be a multiple of block_size. */  max_size -= max_size % block_size;  dout(1) << "_open " << fn << " fd " << fd        ¦ << ": " << max_size        ¦ << " bytes, block size " << block_size        ¦ << " bytes, directio = " << directio        ¦ << ", aio = " << aio        ¦ << dendl;  return 0; out_fd:  VOID_TEMP_FAILURE_RETRY(::close(fd));  fd = -1;  return ret;}


int FileJournal::_open_block_device(){  int64_t bdev_sz = 0;  int ret = get_block_device_size(fd, &bdev_sz);  if (ret) {  ¦ dout(0) << __func__ << ": failed to read block device size." << dendl;  ¦ return -EIO;  }  /* Check for bdev_sz too small */  if (bdev_sz < ONE_MEG) {  ¦ dout(0) << __func__ << ": your block device must be at least "  ¦ ¦ << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;  ¦ return -EINVAL;  }  dout(10) << __func__ << ": ignoring osd journal size. "        ¦ ¦<< "We'll use the entire block device (size: " << bdev_sz << ")"        ¦ ¦<< dendl;  max_size = bdev_sz;  block_size = cct->_conf->journal_block_size;  if (cct->_conf->journal_discard) {  //获取磁盘对discard的支持(/sys/block/sdb/queue/discard_granularity)  ¦ discard = block_device_support_discard(fn.c_str());  ¦ dout(10) << fn << " support discard: " << (int)discard << dendl;  }  return 0;}//获取块设备的大小int get_block_device_size(int fd, int64_t *psize){                                                                                                                                                                                            #ifdef BLKGETSIZE64  int ret = ::ioctl(fd, BLKGETSIZE64, psize);#elif defined(BLKGETSIZE)  unsigned long sectors = 0;  int ret = ::ioctl(fd, BLKGETSIZE, &sectors);  *psize = sectors * 512ULL;#else// cppcheck-suppress preprocessorErrorDirective# error "Linux configuration error (get_block_device_size)"#endif  if (ret < 0)    ret = -errno;  return ret; }
int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,                                                                                                                                                          bool create){  int ret;  //配置日志文件的大小  int64_t conf_journal_sz(cct->_conf->osd_journal_size);  conf_journal_sz <<= 20;  if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {  ¦ derr << "I'm sorry, I don't know how large of a journal to create."        ¦<< "Please specify a block device to use as the journal OR "        ¦<< "set osd_journal_size in your ceph.conf" << dendl;  ¦ return -EINVAL;  }  if (create && (oldsize < conf_journal_sz)) {  ¦ uint64_t newsize(conf_journal_sz);  ¦ dout(10) <<  __func__ << " _open extending to " << newsize << " bytes" << dendl;  //扩展日志文件大小,但是该方法只分配了虚拟的空间,即没有实际的数据块  ¦ ret = ::ftruncate(fd, newsize);  ¦ if (ret < 0) {  ¦ ¦ int err = errno;  ¦ ¦ derr << "FileJournal::_open_file : unable to extend journal to "        ¦ ¦<< newsize << " bytes: " << cpp_strerror(err) << dendl;  ¦ ¦ return -err;  ¦ }#ifdef HAVE_POSIX_FALLOCATE//为文件分配实际的磁盘空间,以防止磁盘空间不足导致写入失败。  ¦ ret = ::posix_fallocate(fd, 0, newsize);  ¦ if (ret) {  ¦ ¦ derr << "FileJournal::_open_file : unable to preallocation journal to "        ¦ ¦<< newsize << " bytes: " << cpp_strerror(ret) << dendl;  ¦ ¦ return -ret;  ¦ }  ¦ max_size = newsize;#elif defined(__APPLE__)  ¦ fstore_t store;  ¦ store.fst_flags = F_ALLOCATECONTIG;  ¦ store.fst_posmode = F_PEOFPOSMODE;  ¦ store.fst_offset = 0;  ¦ store.fst_length = newsize;//同上  ¦ ret = ::fcntl(fd, F_PREALLOCATE, &store);  ¦ if (ret == -1) {  ¦ ¦ ret = -errno;  ¦ ¦ derr << "FileJournal::_open_file : unable to preallocation journal to "        ¦ ¦<< newsize << " bytes: " << cpp_strerror(ret) << dendl;  ¦ ¦ return ret;  ¦ }  ¦ max_size = newsize;#else# error "Journal pre-allocation not supported on platform."#endif  }  else {  ¦ max_size = oldsize;  }  block_size = cct->_conf->journal_block_size;//初始化日志空间,通过填充‘0’  if 
(create && cct->_conf->journal_zero_on_create) {  ¦ derr << "FileJournal::_open_file : zeroing journal" << dendl;  ¦ uint64_t write_size = 1 << 20;  ¦ char *buf;  //申请一块block_size内存对其的write_size大小的内存空间。  ¦ ret = ::posix_memalign((void **)&buf, block_size, write_size);  ¦ if (ret != 0) {  ¦ ¦ return -ret;  ¦ }  ¦ memset(static_cast<void*>(buf), 0, write_size);  ¦ uint64_t i = 0;  ¦ for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {  ¦ ¦ ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);  ¦ ¦ if (ret < 0) {        free(buf);        return -errno;  ¦ ¦ }  ¦ }  ¦ if (i < (uint64_t)max_size) {  ¦ ¦ ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);  ¦ ¦ if (ret < 0) {        free(buf);        return -errno;  ¦ ¦ }  ¦ }  ¦ free(buf);  }  dout(10) << "_open journal is not a block device, NOT checking disk "  ¦ ¦ ¦ ¦ ¦<< "write cache on '" << fn << "'" << dendl;  return 0;}
int FileJournal::read_header(header_t *hdr) const{  dout(10) << "read_header" << dendl;  bufferlist bl;  buffer::ptr bp = buffer::create_page_aligned(block_size);  char* bpdata = bp.c_str();  int r = ::pread(fd, bpdata, bp.length(), 0);  if (r < 0) {  ¦ int err = errno;  ¦ dout(0) << "read_header got " << cpp_strerror(err) << dendl;  ¦ return -err;  }  // don't use bp.zero() here, because it also invalidates  // crc cache (which is not yet populated anyway)  if (bp.length() != (size_t)r) {  ¦ ¦ // r will be always less or equal than bp.length  ¦ ¦ bpdata += r;  ¦ ¦ memset(bpdata, 0, bp.length() - r);  }  bl.push_back(std::move(bp));  try {  ¦ bufferlist::iterator p = bl.begin();  ¦ ::decode(*hdr, p);  }  catch (buffer::error& e) {  ¦ derr << "read_header error decoding journal header" << dendl;  ¦ return -EINVAL;  }  /*  ¦* Unfortunately we weren't initializing the flags field for new  ¦* journals!  Aie.  This is safe(ish) now that we have only one  ¦* flag.  Probably around when we add the next flag we need to  ¦* remove this or else this (eventually old) code will clobber newer  ¦* code's flags.  ¦*/  if (hdr->flags > 3) {  ¦ derr << "read_header appears to have gibberish flags; assuming 0" << dendl;  ¦ hdr->flags = 0;  }  print_header(*hdr);  return 0;}void FileJournal::print_header(const header_t &header) const                                                                                                                                 {  dout(10) << "header: block_size " << header.block_size        ¦ ¦<< " alignment " << header.alignment        ¦ ¦<< " max_size " << header.max_size        ¦ ¦<< dendl;  dout(10) << "header: start " << header.start << dendl;  dout(10) << " write_pos " << write_pos << dendl;} 