From 772430dd67955850942d689714ab982da24257ba Mon Sep 17 00:00:00 2001 From: Kirk McKusick Date: Fri, 17 Nov 2023 14:10:29 -0800 Subject: [PATCH] Ensure I/O buffers in libufs(3) are 128-byte aligned. Various disk controllers require their buffers to be aligned to a cache-line size (128 bytes). For buffers allocated in structures, ensure that they are 128-byte aligned. Use aligned_malloc to allocate memory to ensure that the returned memory is 128-byte aligned. While we are here, we replace the dynamically allocated inode buffer with a buffer allocated in the uufsd structure just as the superblock and cylinder group buffers do. This can be removed if/when the kernel is fixed. Because this problem has existed on one I/O subsystem or another since the 1990's, we are probably stuck with dealing with it forever. The problem most recent showed up in Azure, see: https://reviews.freebsd.org/D41728 https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=267654 Before these fixes were applied, it was confirmed that the changes in this commit also fixed the issue in Azure. Reviewed-by: Warner Losh, kib Tested-by: Souradeep Chakrabarti of Microsoft (earlier version) PR: 267654 Differential Revision: https://reviews.freebsd.org/D41724 --- lib/libufs/Makefile | 2 +- lib/libufs/block.c | 45 ++++++++-------------------- lib/libufs/inode.c | 16 ++-------- lib/libufs/libufs.h | 59 ++++++++++++++++++++++++------------- lib/libufs/sblock.c | 3 +- lib/libufs/type.c | 12 ++++---- lib/libufs/ufs_disk_close.3 | 9 +++++- sbin/fsck_ffs/fsck.h | 15 ++++++++++ sbin/fsck_ffs/fsutil.c | 11 ++++--- sbin/fsck_ffs/inode.c | 3 +- sbin/fsck_ffs/main.c | 1 - sbin/fsck_ffs/pass5.c | 1 - sbin/fsck_ffs/setup.c | 11 ++++--- sbin/fsck_ffs/suj.c | 3 +- 14 files changed, 99 insertions(+), 92 deletions(-) diff --git a/lib/libufs/Makefile b/lib/libufs/Makefile index e37a285bb509..42d76c6061d2 100644 --- a/lib/libufs/Makefile +++ b/lib/libufs/Makefile @@ -2,7 +2,7 @@ PACKAGE= ufs LIB= ufs SHLIBDIR?= /lib -SHLIB_MAJOR= 7 +SHLIB_MAJOR= 8 SRCS= block.c cgroup.c gsb_crc32.c inode.c sblock.c type.c ffs_subr.c SRCS+= ffs_tables.c diff --git a/lib/libufs/block.c b/lib/libufs/block.c index 20002fc4df1b..e15216aefa8d 100644 --- a/lib/libufs/block.c +++ b/lib/libufs/block.c @@ -57,19 +57,10 @@ bread(struct uufsd *disk, ufs2_daddr_t blockno, void *data, size_t size) ERROR(disk, NULL); - p2 = data; - /* - * XXX: various disk controllers require alignment of our buffer - * XXX: which is stricter than struct alignment. - * XXX: Bounce the buffer if not 64 byte aligned. - * XXX: this can be removed if/when the kernel is fixed - */ - if (((intptr_t)data) & 0x3f) { - p2 = malloc(size); - if (p2 == NULL) { - ERROR(disk, "allocate bounce buffer"); - goto fail; - } + BUF_MALLOC(&p2, data, size); + if (p2 == NULL) { + ERROR(disk, "allocate bounce buffer"); + goto fail; } cnt = pread(disk->d_fd, p2, size, (off_t)(blockno * disk->d_bsize)); if (cnt == -1) { @@ -101,7 +92,7 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size) { ssize_t cnt; int rv; - void *p2 = NULL; + void *p2; ERROR(disk, NULL); @@ -110,24 +101,15 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size) ERROR(disk, "failed to open disk for writing"); return (-1); } - - /* - * XXX: various disk controllers require alignment of our buffer - * XXX: which is stricter than struct alignment. - * XXX: Bounce the buffer if not 64 byte aligned. - * XXX: this can be removed if/when the kernel is fixed - */ - if (((intptr_t)data) & 0x3f) { - p2 = malloc(size); - if (p2 == NULL) { - ERROR(disk, "allocate bounce buffer"); - return (-1); - } - memcpy(p2, data, size); - data = p2; + BUF_MALLOC(&p2, data, size); + if (p2 == NULL) { + ERROR(disk, "allocate bounce buffer"); + return (-1); } - cnt = pwrite(disk->d_fd, data, size, (off_t)(blockno * disk->d_bsize)); - if (p2 != NULL) + if (p2 != data) + memcpy(p2, data, size); + cnt = pwrite(disk->d_fd, p2, size, (off_t)(blockno * disk->d_bsize)); + if (p2 != data) free(p2); if (cnt == -1) { ERROR(disk, "write error to block device"); @@ -137,7 +119,6 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size) ERROR(disk, "short write to block device"); return (-1); } - return (cnt); } diff --git a/lib/libufs/inode.c b/lib/libufs/inode.c index fe34fd45b815..3865e50591de 100644 --- a/lib/libufs/inode.c +++ b/lib/libufs/inode.c @@ -62,18 +62,10 @@ getinode(struct uufsd *disk, union dinodep *dp, ino_t inum) ERROR(disk, "inode number out of range"); return (-1); } - inoblock = disk->d_inoblock; + inoblock = (caddr_t)&disk->d_inos[0]; min = disk->d_inomin; max = disk->d_inomax; - if (inoblock == NULL) { - inoblock = malloc(fs->fs_bsize); - if (inoblock == NULL) { - ERROR(disk, "unable to allocate inode block"); - return (-1); - } - disk->d_inoblock = inoblock; - } if (inum >= min && inum < max) goto gotit; bread(disk, fsbtodb(fs, ino_to_fsba(fs, inum)), inoblock, @@ -107,14 +99,10 @@ putinode(struct uufsd *disk) struct fs *fs; fs = &disk->d_fs; - if (disk->d_inoblock == NULL) { - ERROR(disk, "No inode block allocated"); - return (-1); - } if (disk->d_ufs == 2) ffs_update_dinode_ckhash(fs, disk->d_dp.dp2); if (bwrite(disk, fsbtodb(fs, ino_to_fsba(&disk->d_fs, disk->d_inomin)), - disk->d_inoblock, disk->d_fs.fs_bsize) <= 0) + (caddr_t)&disk->d_inos[0], disk->d_fs.fs_bsize) <= 0) return (-1); return (0); } diff --git a/lib/libufs/libufs.h b/lib/libufs/libufs.h index 4c6242e9daef..45ac97f43c06 100644 --- a/lib/libufs/libufs.h +++ b/lib/libufs/libufs.h @@ -30,6 +30,13 @@ #ifndef __LIBUFS_H__ #define __LIBUFS_H__ +/* + * Various disk controllers require their buffers to be aligned to the size + * of a cache line. The LIBUFS_BUFALIGN defines the required alignment size. + * The alignment must be a power of 2. + */ +#define LIBUFS_BUFALIGN 128 + /* * libufs structures. */ @@ -42,39 +49,51 @@ union dinodep { * userland ufs disk. */ struct uufsd { - const char *d_name; /* disk name */ - int d_ufs; /* decimal UFS version */ - int d_fd; /* raw device file descriptor */ - long d_bsize; /* device bsize */ - ufs2_daddr_t d_sblock; /* superblock location */ - struct fs_summary_info *d_si; /* Superblock summary info */ - caddr_t d_inoblock; /* inode block */ - uint32_t d_inomin; /* low ino, not ino_t for ABI compat */ - uint32_t d_inomax; /* high ino, not ino_t for ABI compat */ - union dinodep d_dp; /* pointer to currently active inode */ union { struct fs d_fs; /* filesystem information */ - char d_sb[MAXBSIZE]; /* superblock as buffer */ - } d_sbunion; + char d_sb[SBLOCKSIZE]; /* superblock as buffer */ + } d_sbunion __aligned(LIBUFS_BUFALIGN); union { struct cg d_cg; /* cylinder group */ char d_buf[MAXBSIZE]; /* cylinder group storage */ - } d_cgunion; - int d_ccg; /* current cylinder group */ - int d_lcg; /* last cylinder group (in d_cg) */ + } d_cgunion __aligned(LIBUFS_BUFALIGN); + union { + union dinodep d_ino[1]; /* inode block */ + char d_inos[MAXBSIZE]; /* inode block as buffer */ + } d_inosunion __aligned(LIBUFS_BUFALIGN); + const char *d_name; /* disk name */ const char *d_error; /* human readable disk error */ + ufs2_daddr_t d_sblock; /* superblock location */ + struct fs_summary_info *d_si; /* Superblock summary info */ + union dinodep d_dp; /* pointer to currently active inode */ + ino_t d_inomin; /* low ino */ + ino_t d_inomax; /* high ino */ off_t d_sblockloc; /* where to look for the superblock */ - int d_lookupflags; /* flags to superblock lookup */ - int d_mine; /* internal flags */ -#define d_fs d_sbunion.d_fs -#define d_sb d_sbunion.d_sb -#define d_cg d_cgunion.d_cg + int64_t d_bsize; /* device bsize */ + int64_t d_lookupflags; /* flags to superblock lookup */ + int64_t d_mine; /* internal flags */ + int32_t d_ccg; /* current cylinder group */ + int32_t d_ufs; /* decimal UFS version */ + int32_t d_fd; /* raw device file descriptor */ + int32_t d_lcg; /* last cylinder group (in d_cg) */ }; +#define d_inos d_inosunion.d_inos +#define d_fs d_sbunion.d_fs +#define d_cg d_cgunion.d_cg /* * libufs macros (internal, non-exported). */ #ifdef _LIBUFS +/* + * Ensure that the buffer is aligned to the I/O subsystem requirements. + */ +#define BUF_MALLOC(newbufpp, data, size) { \ + if (data != NULL && (((intptr_t)data) & (LIBUFS_BUFALIGN - 1)) == 0) \ + *newbufpp = (void *)data; \ + else \ + *newbufpp = aligned_alloc(LIBUFS_BUFALIGN, size); \ +} /* * Trace steps through libufs, to be used at entry and erroneous return. */ diff --git a/lib/libufs/sblock.c b/lib/libufs/sblock.c index 9f1d8a2485bd..59cd44de04ab 100644 --- a/lib/libufs/sblock.c +++ b/lib/libufs/sblock.c @@ -228,7 +228,8 @@ use_pread(void *devfd, off_t loc, void **bufp, int size) int fd; fd = *(int *)devfd; - if ((*bufp = malloc(size)) == NULL) + BUF_MALLOC(bufp, NULL, size); + if (*bufp == NULL) return (ENOSPC); if (pread(fd, *bufp, size, loc) != size) return (EIO); diff --git a/lib/libufs/type.c b/lib/libufs/type.c index 1d0c4c0200eb..99aec0a57e9a 100644 --- a/lib/libufs/type.c +++ b/lib/libufs/type.c @@ -61,10 +61,6 @@ ufs_disk_close(struct uufsd *disk) ERROR(disk, NULL); close(disk->d_fd); disk->d_fd = -1; - if (disk->d_inoblock != NULL) { - free(disk->d_inoblock); - disk->d_inoblock = NULL; - } if (disk->d_mine & MINE_NAME) { free((char *)(uintptr_t)disk->d_name); disk->d_name = NULL; @@ -155,10 +151,16 @@ again: if ((ret = stat(name, &st)) < 0) { return (-1); } + if (((uintptr_t)disk & ~(LIBUFS_BUFALIGN - 1)) != (uintptr_t)disk) { + ERROR(disk, "uufsd structure must be aligned to " + "LIBUFS_BUFALIGN byte boundry, see ufs_disk_fillout(3)"); + close(fd); + return (-1); + } + disk->d_bsize = 1; disk->d_ccg = 0; disk->d_fd = fd; - disk->d_inoblock = NULL; disk->d_inomin = 0; disk->d_inomax = 0; disk->d_lcg = 0; diff --git a/lib/libufs/ufs_disk_close.3 b/lib/libufs/ufs_disk_close.3 index d0d93d05838c..f332a9bb5de9 100644 --- a/lib/libufs/ufs_disk_close.3 +++ b/lib/libufs/ufs_disk_close.3 @@ -9,7 +9,7 @@ .\" .\" This file is in the public domain. .\" -.Dd June 4, 2003 +.Dd November 17, 2023 .Dt UFS_DISK_CLOSE 3 .Os .Sh NAME @@ -51,6 +51,13 @@ functions open a disk specified by .Fa name and populate the structure pointed to by .Fa disk . +The structure referenced by the +.Fa disk +pointer must be aligned to at least the alignment specified by +.Dv LIBUFS_ALIGN +that is defined in the +.Lb libufs.h +header file. The disk is opened read-only. The specified .Fa name diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h index 3b795783f39c..827336a77d67 100644 --- a/sbin/fsck_ffs/fsck.h +++ b/sbin/fsck_ffs/fsck.h @@ -67,6 +67,7 @@ #include #include #include +#include #include @@ -424,6 +425,20 @@ Malloc(size_t size) break; return (retval); } +/* + * Allocate a block of memory to be used as an I/O buffer. + * Ensure that the buffer is aligned to the I/O subsystem requirements. + */ +static inline void* +Balloc(size_t size) +{ + void *retval; + + while ((retval = aligned_alloc(LIBUFS_BUFALIGN, size)) == NULL) + if (flushentry() == 0) + break; + return (retval); +} /* * Wrapper for calloc() that flushes the cylinder group cache to try diff --git a/sbin/fsck_ffs/fsutil.c b/sbin/fsck_ffs/fsutil.c index 2583e324e94c..05f83789236e 100644 --- a/sbin/fsck_ffs/fsutil.c +++ b/sbin/fsck_ffs/fsutil.c @@ -58,7 +58,6 @@ static const char sccsid[] = "@(#)utilities.c 8.6 (Berkeley) 5/19/95"; #include #include #include -#include #include "fsck.h" @@ -189,7 +188,7 @@ bufinit(void) initbarea(&failedbuf, BT_UNKNOWN); failedbuf.b_errs = -1; failedbuf.b_un.b_buf = NULL; - if ((cgblk.b_un.b_buf = Malloc((unsigned int)sblock.fs_bsize)) == NULL) + if ((cgblk.b_un.b_buf = Balloc((unsigned int)sblock.fs_bsize)) == NULL) errx(EEXIT, "Initial malloc(%d) failed", sblock.fs_bsize); initbarea(&cgblk, BT_CYLGRP); numbufs = cachelookups = cachereads = 0; @@ -211,7 +210,7 @@ allocbuf(const char *failreason) char *bufp; bp = (struct bufarea *)Malloc(sizeof(struct bufarea)); - bufp = Malloc((unsigned int)sblock.fs_bsize); + bufp = Balloc((unsigned int)sblock.fs_bsize); if (bp == NULL || bufp == NULL) { errx(EEXIT, "%s", failreason); /* NOTREACHED */ @@ -241,7 +240,7 @@ cglookup(int cg) if ((unsigned) cg >= sblock.fs_ncg) errx(EEXIT, "cglookup: out of range cylinder group %d", cg); if (cgbufs == NULL) { - cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea)); + cgbufs = Calloc(sblock.fs_ncg, sizeof(struct bufarea)); if (cgbufs == NULL) errx(EEXIT, "Cannot allocate cylinder group buffers"); } @@ -250,7 +249,7 @@ cglookup(int cg) return (cgbp); cgp = NULL; if (flushtries == 0) - cgp = Malloc((unsigned int)sblock.fs_cgsize); + cgp = Balloc((unsigned int)sblock.fs_cgsize); if (cgp == NULL) { if (sujrecovery) errx(EEXIT,"Ran out of memory during journal recovery"); @@ -966,7 +965,7 @@ blzero(int fd, ufs2_daddr_t blk, long size) if (fd < 0) return; if (zero == NULL) { - zero = calloc(ZEROBUFSIZE, 1); + zero = Balloc(ZEROBUFSIZE); if (zero == NULL) errx(EEXIT, "cannot allocate buffer pool"); } diff --git a/sbin/fsck_ffs/inode.c b/sbin/fsck_ffs/inode.c index 3db8a5e5c23d..e4349ff97088 100644 --- a/sbin/fsck_ffs/inode.c +++ b/sbin/fsck_ffs/inode.c @@ -48,7 +48,6 @@ static const char sccsid[] = "@(#)inode.c 8.8 (Berkeley) 4/28/95"; #include #include #include -#include #include "fsck.h" @@ -646,7 +645,7 @@ setinodebuf(int cg, ino_t inosused) inobufsize = blkroundup(&sblock, MAX(INOBUFSIZE, sblock.fs_bsize)); initbarea(&inobuf, BT_INODES); - if ((inobuf.b_un.b_buf = Malloc((unsigned)inobufsize)) == NULL) + if ((inobuf.b_un.b_buf = Balloc((unsigned)inobufsize)) == NULL) errx(EEXIT, "cannot allocate space for inode buffer"); } fullcnt = inobufsize / ((sblock.fs_magic == FS_UFS1_MAGIC) ? diff --git a/sbin/fsck_ffs/main.c b/sbin/fsck_ffs/main.c index ee39760f4dd2..4189af1ba517 100644 --- a/sbin/fsck_ffs/main.c +++ b/sbin/fsck_ffs/main.c @@ -59,7 +59,6 @@ static char sccsid[] = "@(#)main.c 8.6 (Berkeley) 5/14/95"; #include #include #include -#include #include #include #include diff --git a/sbin/fsck_ffs/pass5.c b/sbin/fsck_ffs/pass5.c index 58de6791903f..8980ba60e03a 100644 --- a/sbin/fsck_ffs/pass5.c +++ b/sbin/fsck_ffs/pass5.c @@ -45,7 +45,6 @@ static const char sccsid[] = "@(#)pass5.c 8.9 (Berkeley) 4/28/95"; #include #include #include -#include #include "fsck.h" diff --git a/sbin/fsck_ffs/setup.c b/sbin/fsck_ffs/setup.c index c9aa19c7eded..b3d58749015e 100644 --- a/sbin/fsck_ffs/setup.c +++ b/sbin/fsck_ffs/setup.c @@ -52,7 +52,6 @@ static const char sccsid[] = "@(#)setup.c 8.10 (Berkeley) 5/9/95"; #include #include #include -#include #include "fsck.h" @@ -214,7 +213,7 @@ setup(char *dev) sbdirty(); } if (snapcnt > 0 && copybuf == NULL) { - copybuf = Malloc(sblock.fs_bsize); + copybuf = Balloc(sblock.fs_bsize); if (copybuf == NULL) errx(EEXIT, "cannot allocate space for snapshot " "copy buffer"); @@ -501,7 +500,7 @@ sblock_init(void) fsmodified = 0; lfdir = 0; initbarea(&sblk, BT_SUPERBLK); - sblk.b_un.b_buf = Malloc(SBLOCKSIZE); + sblk.b_un.b_buf = Balloc(SBLOCKSIZE); if (sblk.b_un.b_buf == NULL) errx(EEXIT, "cannot allocate space for superblock"); dev_bsize = secsize = DEV_BSIZE; @@ -530,7 +529,7 @@ calcsb(char *dev, int devfd, struct fs *fs) */ if (ioctl(devfd, DIOCGSECTORSIZE, &secsize) == -1) return (0); - fsrbuf = Malloc(secsize); + fsrbuf = Balloc(secsize); if (fsrbuf == NULL) errx(EEXIT, "calcsb: cannot allocate recovery buffer"); if (blread(devfd, fsrbuf, @@ -573,7 +572,7 @@ chkrecovery(int devfd) rdsize = sblock.fs_fsize; if (ioctl(devfd, DIOCGSECTORSIZE, &secsize) == -1 || rdsize % secsize != 0 || - (fsrbuf = Malloc(rdsize)) == NULL || + (fsrbuf = Balloc(rdsize)) == NULL || blread(devfd, fsrbuf, (SBLOCK_UFS2 - rdsize) / dev_bsize, rdsize) != 0) { free(fsrbuf); @@ -612,7 +611,7 @@ saverecovery(int readfd, int writefd) if (sblock.fs_magic != FS_UFS2_MAGIC || ioctl(readfd, DIOCGSECTORSIZE, &secsize) == -1 || rdsize % secsize != 0 || - (fsrbuf = Malloc(rdsize)) == NULL || + (fsrbuf = Balloc(rdsize)) == NULL || blread(readfd, fsrbuf, (SBLOCK_UFS2 - rdsize) / dev_bsize, rdsize) != 0) { printf("RECOVERY DATA COULD NOT BE CREATED\n"); diff --git a/sbin/fsck_ffs/suj.c b/sbin/fsck_ffs/suj.c index c66b605bd69d..e1fd54229d69 100644 --- a/sbin/fsck_ffs/suj.c +++ b/sbin/fsck_ffs/suj.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -2274,7 +2273,7 @@ suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) static void suj_read(void) { - uint8_t block[1 * 1024 * 1024]; + uint8_t block[1 * 1024 * 1024] __aligned(LIBUFS_BUFALIGN); struct suj_seg *seg; struct jsegrec *recn; struct jsegrec *rec;