Fix a device departure bug for the the pass(4), enc(4), sg(4) and ch(4)

drivers.

The bug occurrs when a userland process has the driver instance
open and the underlying device goes away.  We get the devfs
callback that the device node has been destroyed, but not all of
the closes necessary to fully decrement the reference count on the
CAM peripheral.

The reason is that once devfs calls back and says the device has
been destroyed, it is moved off to deadfs, and devfs guarantees
that there will be no more open or close calls.  So the solution
is to keep track of how many outstanding open calls there are on
the device, and just release that many references when we get the
callback from devfs.

scsi_pass.c,
scsi_enc.c,
scsi_enc_internal.h:	Add an open count to the softc in these
			drivers.  Increment it on open and
			decrement it on close.

			When we get a devfs callback to say that
			the device node has gone away, decrement
			the peripheral reference count by the
			number of still outstanding opens.

			Make sure we don't access the peripheral
			with cam_periph_unlock() after what might
			be the final call to
			cam_periph_release_locked().  The
			peripheral might have been freed, and we
			will be dereferencing freed memory.

scsi_ch.c,
scsi_sg.c:		For the ch(4) and sg(4) drivers, add the
			same changes described above, and in
			addition, fix another bug that was
			previously fixed in the pass(4) and enc(4)
			drivers.

			These drivers were calling destroy_dev()
			from their cleanup routine, but that could
			cause a deadlock because the cleanup
			routine could be indirectly called from
			the driver's close routine.  This would
			cause a deadlock, because the device node
			is being held open by the active close
			call, and can't be destroyed.

Sponsored by:	Spectra Logic Corporation
MFC after:	1 week
This commit is contained in:
Kenneth D. Merry 2012-12-08 04:03:04 +00:00
parent e2adc47dbb
commit 86d45c7f3b
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=244014
5 changed files with 305 additions and 15 deletions

View File

@ -144,7 +144,8 @@ struct ch_softc {
ch_quirks quirks; ch_quirks quirks;
union ccb saved_ccb; union ccb saved_ccb;
struct devstat *device_stats; struct devstat *device_stats;
struct cdev *dev; struct cdev *dev;
int open_count;
int sc_picker; /* current picker */ int sc_picker; /* current picker */
@ -236,6 +237,48 @@ chinit(void)
} }
} }
static void
chdevgonecb(void *arg)
{
struct cam_sim *sim;
struct ch_softc *softc;
struct cam_periph *periph;
int i;
periph = (struct cam_periph *)arg;
sim = periph->sim;
softc = (struct ch_softc *)periph->softc;
KASSERT(softc->open_count >= 0, ("Negative open count %d",
softc->open_count));
mtx_lock(sim->mtx);
/*
* When we get this callback, we will get no more close calls from
* devfs. So if we have any dangling opens, we need to release the
* reference held for that particular context.
*/
for (i = 0; i < softc->open_count; i++)
cam_periph_release_locked(periph);
softc->open_count = 0;
/*
* Release the reference held for the device node, it is gone now.
*/
cam_periph_release_locked(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(sim->mtx);
}
static void static void
choninvalidate(struct cam_periph *periph) choninvalidate(struct cam_periph *periph)
{ {
@ -250,6 +293,12 @@ choninvalidate(struct cam_periph *periph)
softc->flags |= CH_FLAG_INVALID; softc->flags |= CH_FLAG_INVALID;
/*
* Tell devfs this device has gone away, and ask for a callback
* when it has cleaned up its state.
*/
destroy_dev_sched_cb(softc->dev, chdevgonecb, periph);
xpt_print(periph->path, "lost device\n"); xpt_print(periph->path, "lost device\n");
} }
@ -262,10 +311,9 @@ chcleanup(struct cam_periph *periph)
softc = (struct ch_softc *)periph->softc; softc = (struct ch_softc *)periph->softc;
xpt_print(periph->path, "removing device entry\n"); xpt_print(periph->path, "removing device entry\n");
devstat_remove_entry(softc->device_stats); devstat_remove_entry(softc->device_stats);
cam_periph_unlock(periph);
destroy_dev(softc->dev);
cam_periph_lock(periph);
free(softc, M_DEVBUF); free(softc, M_DEVBUF);
} }
@ -359,6 +407,19 @@ chregister(struct cam_periph *periph, void *arg)
XPORT_DEVSTAT_TYPE(cpi.transport), XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_OTHER); DEVSTAT_PRIORITY_OTHER);
/*
* Acquire a reference to the periph before we create the devfs
* instance for it. We'll release this reference once the devfs
* instance has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/* Register the device */ /* Register the device */
softc->dev = make_dev(&ch_cdevsw, periph->unit_number, UID_ROOT, softc->dev = make_dev(&ch_cdevsw, periph->unit_number, UID_ROOT,
GID_OPERATOR, 0600, "%s%d", periph->periph_name, GID_OPERATOR, 0600, "%s%d", periph->periph_name,
@ -419,6 +480,9 @@ chopen(struct cdev *dev, int flags, int fmt, struct thread *td)
} }
cam_periph_unhold(periph); cam_periph_unhold(periph);
softc->open_count++;
cam_periph_unlock(periph); cam_periph_unlock(periph);
return(error); return(error);
@ -427,13 +491,36 @@ chopen(struct cdev *dev, int flags, int fmt, struct thread *td)
static int static int
chclose(struct cdev *dev, int flag, int fmt, struct thread *td) chclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{ {
struct cam_sim *sim;
struct cam_periph *periph; struct cam_periph *periph;
struct ch_softc *softc;
periph = (struct cam_periph *)dev->si_drv1; periph = (struct cam_periph *)dev->si_drv1;
if (periph == NULL) if (periph == NULL)
return(ENXIO); return(ENXIO);
cam_periph_release(periph); sim = periph->sim;
softc = (struct ch_softc *)periph->softc;
mtx_lock(sim->mtx);
softc->open_count--;
cam_periph_release_locked(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*
* cam_periph_release() avoids this problem using the same method,
* but we're manually acquiring and dropping the lock here to
* protect the open count and avoid another lock acquisition and
* release.
*/
mtx_unlock(sim->mtx);
return(0); return(0);
} }

View File

@ -111,11 +111,40 @@ enc_init(void)
static void static void
enc_devgonecb(void *arg) enc_devgonecb(void *arg)
{ {
struct cam_sim *sim;
struct cam_periph *periph; struct cam_periph *periph;
struct enc_softc *enc;
int i;
periph = (struct cam_periph *)arg; periph = (struct cam_periph *)arg;
sim = periph->sim;
enc = (struct enc_softc *)periph->softc;
cam_periph_release(periph); mtx_lock(sim->mtx);
/*
* When we get this callback, we will get no more close calls from
* devfs. So if we have any dangling opens, we need to release the
* reference held for that particular context.
*/
for (i = 0; i < enc->open_count; i++)
cam_periph_release_locked(periph);
enc->open_count = 0;
/*
* Release the reference held for the device node, it is gone now.
*/
cam_periph_release_locked(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(sim->mtx);
} }
static void static void
@ -262,6 +291,8 @@ enc_open(struct cdev *dev, int flags, int fmt, struct thread *td)
out: out:
if (error != 0) if (error != 0)
cam_periph_release_locked(periph); cam_periph_release_locked(periph);
else
softc->open_count++;
cam_periph_unlock(periph); cam_periph_unlock(periph);
@ -271,13 +302,36 @@ out:
static int static int
enc_close(struct cdev *dev, int flag, int fmt, struct thread *td) enc_close(struct cdev *dev, int flag, int fmt, struct thread *td)
{ {
struct cam_sim *sim;
struct cam_periph *periph; struct cam_periph *periph;
struct enc_softc *enc;
periph = (struct cam_periph *)dev->si_drv1; periph = (struct cam_periph *)dev->si_drv1;
if (periph == NULL) if (periph == NULL)
return (ENXIO); return (ENXIO);
cam_periph_release(periph); sim = periph->sim;
enc = periph->softc;
mtx_lock(sim->mtx);
enc->open_count--;
cam_periph_release_locked(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*
* cam_periph_release() avoids this problem using the same method,
* but we're manually acquiring and dropping the lock here to
* protect the open count and avoid another lock acquisition and
* release.
*/
mtx_unlock(sim->mtx);
return (0); return (0);
} }
@ -946,6 +1000,11 @@ enc_ctor(struct cam_periph *periph, void *arg)
} }
} }
/*
* Acquire a reference to the periph before we create the devfs
* instance for it. We'll release this reference once the devfs
* instance has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) { if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during " xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__); "registration!\n", __func__);

View File

@ -148,6 +148,7 @@ struct enc_softc {
union ccb saved_ccb; union ccb saved_ccb;
struct cdev *enc_dev; struct cdev *enc_dev;
struct cam_periph *periph; struct cam_periph *periph;
int open_count;
/* Bitmap of pending operations. */ /* Bitmap of pending operations. */
uint32_t pending_actions; uint32_t pending_actions;

View File

@ -76,6 +76,7 @@ struct pass_softc {
pass_flags flags; pass_flags flags;
u_int8_t pd_type; u_int8_t pd_type;
union ccb saved_ccb; union ccb saved_ccb;
int open_count;
struct devstat *device_stats; struct devstat *device_stats;
struct cdev *dev; struct cdev *dev;
struct cdev *alias_dev; struct cdev *alias_dev;
@ -140,12 +141,43 @@ passinit(void)
static void static void
passdevgonecb(void *arg) passdevgonecb(void *arg)
{ {
struct cam_sim *sim;
struct cam_periph *periph; struct cam_periph *periph;
struct pass_softc *softc;
int i;
periph = (struct cam_periph *)arg; periph = (struct cam_periph *)arg;
sim = periph->sim;
softc = (struct pass_softc *)periph->softc;
xpt_print(periph->path, "%s: devfs entry is gone\n", __func__); KASSERT(softc->open_count >= 0, ("Negative open count %d",
cam_periph_release(periph); softc->open_count));
mtx_lock(sim->mtx);
/*
* When we get this callback, we will get no more close calls from
* devfs. So if we have any dangling opens, we need to release the
* reference held for that particular context.
*/
for (i = 0; i < softc->open_count; i++)
cam_periph_release_locked(periph);
softc->open_count = 0;
/*
* Release the reference held for the device node, it is gone now.
*/
cam_periph_release_locked(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(sim->mtx);
} }
static void static void
@ -368,7 +400,7 @@ passregister(struct cam_periph *periph, void *arg)
if (cam_periph_acquire(periph) != CAM_REQ_CMP) { if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during " xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__); "registration!\n", __func__);
mtx_lock(periph->sim->mtx); cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR); return (CAM_REQ_CMP_ERR);
} }
@ -461,6 +493,8 @@ passopen(struct cdev *dev, int flags, int fmt, struct thread *td)
return(EINVAL); return(EINVAL);
} }
softc->open_count++;
cam_periph_unlock(periph); cam_periph_unlock(periph);
return (error); return (error);
@ -469,13 +503,36 @@ passopen(struct cdev *dev, int flags, int fmt, struct thread *td)
static int static int
passclose(struct cdev *dev, int flag, int fmt, struct thread *td) passclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{ {
struct cam_sim *sim;
struct cam_periph *periph; struct cam_periph *periph;
struct pass_softc *softc;
periph = (struct cam_periph *)dev->si_drv1; periph = (struct cam_periph *)dev->si_drv1;
if (periph == NULL) if (periph == NULL)
return (ENXIO); return (ENXIO);
cam_periph_release(periph); sim = periph->sim;
softc = periph->softc;
mtx_lock(sim->mtx);
softc->open_count--;
cam_periph_release_locked(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*
* cam_periph_release() avoids this problem using the same method,
* but we're manually acquiring and dropping the lock here to
* protect the open count and avoid another lock acquisition and
* release.
*/
mtx_unlock(sim->mtx);
return (0); return (0);
} }

View File

@ -99,6 +99,7 @@ struct sg_rdwr {
struct sg_softc { struct sg_softc {
sg_state state; sg_state state;
sg_flags flags; sg_flags flags;
int open_count;
struct devstat *device_stats; struct devstat *device_stats;
TAILQ_HEAD(, sg_rdwr) rdwr_done; TAILQ_HEAD(, sg_rdwr) rdwr_done;
struct cdev *dev; struct cdev *dev;
@ -168,6 +169,49 @@ sginit(void)
} }
} }
static void
sgdevgonecb(void *arg)
{
struct cam_sim *sim;
struct cam_periph *periph;
struct sg_softc *softc;
int i;
periph = (struct cam_periph *)arg;
sim = periph->sim;
softc = (struct sg_softc *)periph->softc;
KASSERT(softc->open_count >= 0, ("Negative open count %d",
softc->open_count));
mtx_lock(sim->mtx);
/*
* When we get this callback, we will get no more close calls from
* devfs. So if we have any dangling opens, we need to release the
* reference held for that particular context.
*/
for (i = 0; i < softc->open_count; i++)
cam_periph_release_locked(periph);
softc->open_count = 0;
/*
* Release the reference held for the device node, it is gone now.
*/
cam_periph_release_locked(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(sim->mtx);
}
static void static void
sgoninvalidate(struct cam_periph *periph) sgoninvalidate(struct cam_periph *periph)
{ {
@ -182,6 +226,12 @@ sgoninvalidate(struct cam_periph *periph)
softc->flags |= SG_FLAG_INVALID; softc->flags |= SG_FLAG_INVALID;
/*
* Tell devfs this device has gone away, and ask for a callback
* when it has cleaned up its state.
*/
destroy_dev_sched_cb(softc->dev, sgdevgonecb, periph);
/* /*
* XXX Return all queued I/O with ENXIO. * XXX Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card * XXX Handle any transactions queued to the card
@ -201,10 +251,9 @@ sgcleanup(struct cam_periph *periph)
softc = (struct sg_softc *)periph->softc; softc = (struct sg_softc *)periph->softc;
if (bootverbose) if (bootverbose)
xpt_print(periph->path, "removing device entry\n"); xpt_print(periph->path, "removing device entry\n");
devstat_remove_entry(softc->device_stats); devstat_remove_entry(softc->device_stats);
cam_periph_unlock(periph);
destroy_dev(softc->dev);
cam_periph_lock(periph);
free(softc, M_DEVBUF); free(softc, M_DEVBUF);
} }
@ -299,6 +348,18 @@ sgregister(struct cam_periph *periph, void *arg)
DEVSTAT_TYPE_PASS, DEVSTAT_TYPE_PASS,
DEVSTAT_PRIORITY_PASS); DEVSTAT_PRIORITY_PASS);
/*
* Acquire a reference to the periph before we create the devfs
* instance for it. We'll release this reference once the devfs
* instance has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/* Register the device */ /* Register the device */
softc->dev = make_dev(&sg_cdevsw, periph->unit_number, softc->dev = make_dev(&sg_cdevsw, periph->unit_number,
UID_ROOT, GID_OPERATOR, 0600, "%s%d", UID_ROOT, GID_OPERATOR, 0600, "%s%d",
@ -414,6 +475,8 @@ sgopen(struct cdev *dev, int flags, int fmt, struct thread *td)
return (ENXIO); return (ENXIO);
} }
softc->open_count++;
cam_periph_unlock(periph); cam_periph_unlock(periph);
return (error); return (error);
@ -422,13 +485,36 @@ sgopen(struct cdev *dev, int flags, int fmt, struct thread *td)
static int static int
sgclose(struct cdev *dev, int flag, int fmt, struct thread *td) sgclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{ {
struct cam_sim *sim;
struct cam_periph *periph; struct cam_periph *periph;
struct sg_softc *softc;
periph = (struct cam_periph *)dev->si_drv1; periph = (struct cam_periph *)dev->si_drv1;
if (periph == NULL) if (periph == NULL)
return (ENXIO); return (ENXIO);
cam_periph_release(periph); sim = periph->sim;
softc = periph->softc;
mtx_lock(sim->mtx);
softc->open_count--;
cam_periph_release_locked(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*
* cam_periph_release() avoids this problem using the same method,
* but we're manually acquiring and dropping the lock here to
* protect the open count and avoid another lock acquisition and
* release.
*/
mtx_unlock(sim->mtx);
return (0); return (0);
} }