libblkid: use mmap() rather than read()

diff between
	perf stat -e 'syscalls:sys_enter_*'
for old and new version:

-                35      syscalls:sys_enter_lseek
-                38      syscalls:sys_enter_read
+                 3      syscalls:sys_enter_read
...
-                19      syscalls:sys_enter_mmap
+                17      syscalls:sys_enter_mmap

-       0.001083084 seconds time elapsed
+       0.000751722 seconds time elapsed

The patch dramatically reduces malloc()+seek()+read() operations in
libblkid. The code mmaps ~2MiB of the begin and the end of the device
and it moves buffers management to kernel.

Signed-off-by: Karel Zak <kzak@redhat.com>
This commit is contained in:
Karel Zak 2015-09-22 15:37:26 +02:00
parent 9325a8be9e
commit a674a0ab03
3 changed files with 144 additions and 64 deletions

View File

@ -180,6 +180,7 @@ struct blkid_struct_probe
int fd; /* device file descriptor */
blkid_loff_t off; /* begin of data on the device */
blkid_loff_t size; /* end of data on the device */
size_t mmap_granularity; /* minimal size of mmaped buffer (PAGE_SIZE) */
dev_t devno; /* device number (st.st_rdev) */
dev_t disk_devno; /* devno of the whole-disk or 0 */
@ -318,6 +319,7 @@ struct blkid_struct_cache
#define BLKID_DEBUG_READ (1 << 10)
#define BLKID_DEBUG_SAVE (1 << 11)
#define BLKID_DEBUG_TAG (1 << 12)
#define BLKID_DEBUG_BUFFER (1 << 13)
#define BLKID_DEBUG_ALL 0xFFFF /* (1 << 16) aka FFFF is expected by API */
UL_DEBUG_DECLARE_MASK(libblkid);

View File

@ -27,6 +27,7 @@ UL_DEBUG_DEFINE_MASKNAMES(libblkid) =
{ "evaluate", BLKID_DEBUG_EVALUATE, "tags resolving" },
{ "help", BLKID_DEBUG_HELP, "this help" },
{ "lowprobe", BLKID_DEBUG_LOWPROBE, "superblock/raids/partitions probing" },
{ "buffer", BLKID_DEBUG_BUFFER, "low-probing buffers" },
{ "probe", BLKID_DEBUG_PROBE, "devices verification" },
{ "read", BLKID_DEBUG_READ, "cache parsing" },
{ "save", BLKID_DEBUG_SAVE, "cache writing" },

View File

@ -104,6 +104,7 @@
#include <stdint.h>
#include <stdarg.h>
#include <limits.h>
#include <sys/mman.h>
#ifdef HAVE_LIBUUID
# include <uuid.h>
@ -580,17 +581,112 @@ int __blkid_probe_filter_types(blkid_probe pr, int chain, int flag, char *names[
return 0;
}
/* align to mmap granularity */
#define PROBE_ALIGN_OFF(p, o) ((o) & ~((p)->mmap_granularity - 1))
/* default buffer sizes */
#define PROBE_MMAP_BEGINSIZ (1024 * 1024 * 2) /* begin of the device */
#define PROBE_MMAP_ENDSIZ (1024 * 1024 * 2) /* end of the device */
#define PROBE_MMAP_MIDSIZ (1024 * 1024) /* middle of the device */
static struct blkid_bufinfo *mmap_buffer(blkid_probe pr,
blkid_loff_t real_off,
blkid_loff_t len)
{
size_t map_len;
blkid_loff_t map_off = 0;
struct blkid_bufinfo *bf = NULL;
/*
* libblkid heavily reads begin and end of the device, so it seems
* better to mmap ~2MiB from the begin and end of the device to reduces
* number of syscalls and necessary buffers. For random accees
* somewhere in the middle of the device we use 1MiB buffers.
*/
if (!pr->mmap_granularity)
pr->mmap_granularity = getpagesize();
/* begin of the device */
if (real_off == 0 || real_off + len < PROBE_MMAP_BEGINSIZ) {
DBG(BUFFER, ul_debug("\tmapping begin of the device"));
map_off = 0;
map_len = PROBE_MMAP_BEGINSIZ > pr->size ? pr->size : PROBE_MMAP_BEGINSIZ;
/* end of the device */
} else if (real_off > pr->off + pr->size - PROBE_MMAP_ENDSIZ) {
DBG(BUFFER, ul_debug("\tmapping end of the device"));
map_off = PROBE_ALIGN_OFF(pr, pr->off + pr->size - PROBE_MMAP_ENDSIZ);
map_len = pr->off + pr->size - map_off;
/* middle of the device */
} else {
blkid_loff_t minlen;
map_off = PROBE_ALIGN_OFF(pr, real_off);
minlen = real_off + len - map_off;
map_len = minlen > PROBE_MMAP_MIDSIZ ? minlen : PROBE_MMAP_MIDSIZ;
if (map_off + map_len > pr->off + pr->size)
map_len = pr->size - map_off;
}
assert(map_off <= real_off);
assert(map_off + map_len >= real_off + len);
/* allocate buffer handler */
bf = malloc(sizeof(*bf));
if (!bf) {
errno = ENOMEM;
return NULL;
}
/* mmap into memmory */
bf->data = mmap(NULL, map_len, PROT_READ, MAP_SHARED, pr->fd, map_off);
if (bf->data == MAP_FAILED) {
DBG(BUFFER, ul_debug("\tmmap failed: %m"));
free(bf);
return NULL;
}
bf->off = map_off;
bf->len = map_len;
INIT_LIST_HEAD(&bf->bufs);
DBG(BUFFER, ul_debug("\tmmap %p: off=%ju, len=%ju (%ju pages)",
bf->data, map_off, map_len,
map_len / pr->mmap_granularity));
return bf;
}
/*
* Note that @off is offset within probing area, the probing area is defined by
* pr->off and pr->size.
*/
unsigned char *blkid_probe_get_buffer(blkid_probe pr,
blkid_loff_t off, blkid_loff_t len)
{
struct list_head *p;
struct blkid_bufinfo *bf = NULL;
blkid_loff_t real_off = pr->off + off;
/*
DBG(BUFFER, ul_debug("\t>>>> off=%ju, real-off=%ju (probe <%ju..%ju>, len=%ju",
off, real_off, pr->off, pr->off + pr->size, len));
*/
if (pr->size <= 0) {
errno = EINVAL;
return NULL;
}
if (len == 0 || pr->off + pr->size < real_off + len) {
DBG(BUFFER, ul_debug("\t ignore: request out of probing area"));
errno = 0;
return NULL;
}
if (pr->parent &&
pr->parent->devno == pr->devno &&
pr->parent->off <= pr->off &&
@ -600,88 +696,64 @@ unsigned char *blkid_probe_get_buffer(blkid_probe pr,
* parent. Let's use parent's buffers.
*
* Note that pr->off (and pr->parent->off) is always from the
* beginig of the device.
* begin of the device.
*/
return blkid_probe_get_buffer(pr->parent,
pr->off + off - pr->parent->off, len);
}
/* try buffers we already have in memmory */
list_for_each(p, &pr->buffers) {
struct blkid_bufinfo *x =
list_entry(p, struct blkid_bufinfo, bufs);
if (x->off <= off && off + len <= x->off + x->len) {
DBG(LOWPROBE, ul_debug("\treuse buffer: off=%jd len=%jd pr=%p",
x->off, x->len, pr));
if (real_off >= x->off && real_off + len <= x->off + x->len) {
DBG(BUFFER, ul_debug("\treuse %p: off=%jd len=%jd",
x->data, x->off, x->len));
bf = x;
break;
}
}
/* not found; read from disk */
if (!bf) {
ssize_t ret;
if (blkid_llseek(pr->fd, pr->off + off, SEEK_SET) < 0) {
errno = 0;
bf = mmap_buffer(pr, real_off, len);
if (!bf)
return NULL;
}
/* someone trying to overflow some buffers? */
if (len > ULONG_MAX - sizeof(struct blkid_bufinfo)) {
errno = ENOMEM;
return NULL;
}
/* allocate info and space for data by why call */
bf = calloc(1, sizeof(struct blkid_bufinfo) + len);
if (!bf) {
errno = ENOMEM;
return NULL;
}
bf->data = ((unsigned char *) bf) + sizeof(struct blkid_bufinfo);
bf->len = len;
bf->off = off;
INIT_LIST_HEAD(&bf->bufs);
DBG(LOWPROBE, ul_debug("\tbuffer read: off=%jd len=%jd pr=%p",
off, len, pr));
ret = read(pr->fd, bf->data, len);
if (ret != (ssize_t) len) {
DBG(LOWPROBE, ul_debug("\tbuffer read: return %zd error %m", ret));
free(bf);
if (ret >= 0)
errno = 0;
return NULL;
}
list_add_tail(&bf->bufs, &pr->buffers);
}
assert(bf->off <= real_off);
assert(bf->off + bf->len >= real_off + len);
errno = 0;
return off ? bf->data + (off - bf->off) : bf->data;
return real_off ? bf->data + (real_off - bf->off) : bf->data;
}
static void blkid_probe_reset_buffer(blkid_probe pr)
{
uint64_t read_ct = 0, len_ct = 0;
uint64_t mmap_ct = 0, len_ct = 0;
if (!pr || list_empty(&pr->buffers))
return;
DBG(LOWPROBE, ul_debug("reseting probing buffers pr=%p", pr));
DBG(BUFFER, ul_debug("reseting probing buffers pr=%p", pr));
while (!list_empty(&pr->buffers)) {
struct blkid_bufinfo *bf = list_entry(pr->buffers.next,
struct blkid_bufinfo, bufs);
read_ct++;
mmap_ct++;
len_ct += bf->len;
list_del(&bf->bufs);
DBG(BUFFER, ul_debug(" unmap: %p [off=%ju, len=%ju]", bf->data, bf->off, bf->len));
munmap(bf->data, bf->len);
free(bf);
}
DBG(LOWPROBE, ul_debug("buffers summary: %"PRIu64" bytes "
"by %"PRIu64" read() call(s)",
len_ct, read_ct));
DBG(LOWPROBE, ul_debug("buffers summary: %ju bytes (%ju pages) by %ju mmap() call(s)",
len_ct, len_ct / pr->mmap_granularity, mmap_ct));
INIT_LIST_HEAD(&pr->buffers);
}
@ -734,6 +806,7 @@ int blkid_probe_set_device(blkid_probe pr, int fd,
blkid_loff_t off, blkid_loff_t size)
{
struct stat sb;
blkid_loff_t devsiz = 0;
if (!pr)
return -1;
@ -766,33 +839,35 @@ int blkid_probe_set_device(blkid_probe pr, int fd,
if (fstat(fd, &sb))
goto err;
if (!S_ISBLK(sb.st_mode) && !S_ISCHR(sb.st_mode) && !S_ISREG(sb.st_mode))
if (!S_ISBLK(sb.st_mode) && !S_ISCHR(sb.st_mode) && !S_ISREG(sb.st_mode)) {
errno = EINVAL;
goto err;
}
pr->mode = sb.st_mode;
if (S_ISBLK(sb.st_mode) || S_ISCHR(sb.st_mode))
pr->devno = sb.st_rdev;
if (size)
pr->size = size;
else {
if (S_ISBLK(sb.st_mode)) {
if (blkdev_get_size(fd, (unsigned long long *) &pr->size)) {
DBG(LOWPROBE, ul_debug("failed to get device size"));
goto err;
}
} else if (S_ISCHR(sb.st_mode))
pr->size = 1; /* UBI devices are char... */
else if (S_ISREG(sb.st_mode))
pr->size = sb.st_size; /* regular file */
if (pr->off > pr->size)
if (S_ISBLK(sb.st_mode)) {
if (blkdev_get_size(fd, (unsigned long long *) &devsiz)) {
DBG(LOWPROBE, ul_debug("failed to get device size"));
goto err;
}
} else if (S_ISCHR(sb.st_mode))
devsiz = 1; /* UBI devices are char... */
else if (S_ISREG(sb.st_mode))
devsiz = sb.st_size; /* regular file */
/* The probing area cannot be larger than whole device, pr->off
* is offset within the device */
pr->size -= pr->off;
pr->size = size ? size : devsiz;
if (off && size == 0)
/* only offset without size specified */
pr->size -= off;
if (pr->off + pr->size > devsiz) {
DBG(LOWPROBE, ul_debug("area specified by offset and size is bigger than device"));
errno = EINVAL;
goto err;
}
if (pr->size <= 1440 * 1024 && !S_ISCHR(sb.st_mode))
@ -887,8 +962,10 @@ int blkid_probe_get_idmag(blkid_probe pr, const struct blkid_idinfo *id,
if (!buf && errno)
return -errno;
if (buf && !memcmp(mag->magic,
buf + (mag->sboff & 0x3ff), mag->len)) {
DBG(LOWPROBE, ul_debug("\tmagic sboff=%u, kboff=%ld",
mag->sboff, mag->kboff));
if (offset)