author     Mikolaj Golub <trociny@FreeBSD.org>    2013-12-10 19:56:26 +0000
committer  Mikolaj Golub <trociny@FreeBSD.org>    2013-12-10 19:56:26 +0000
commit     5d69ed535e3960bf3f24d4f19dc6f740bbd6c60b (patch)
tree       74af86cdc30e1789cc81e7449547d7516fdcf81c /sbin
parent     28bb97ddca72eb0c228dbe1f6cf20bc06dfe9af6 (diff)
download   src-5d69ed535e3960bf3f24d4f19dc6f740bbd6c60b.tar.gz
           src-5d69ed535e3960bf3f24d4f19dc6f740bbd6c60b.zip
For memsync replication, hio_countdown is used not only as an
indication of when a request can be moved to the done queue, but also
for detecting the current state of a memsync request. This approach has
problems, e.g. leaking a request if the memsync ack from the secondary
failed, or racy usage of write_complete(), which should be called only
once per write request but for memsync can be entered by
local_send_thread and ggate_send_thread simultaneously.

So the following approach is implemented instead:

1) Use hio_countdown only for counting the components we are still
   waiting on to complete, i.e. initially it is always 2 for any
   replication mode.

2) To distinguish between "memsync ack" and "memsync fin" responses
   from the secondary, add and use the hio_memsyncacked field.

3) write_complete() in component threads is called only before
   releasing hio_countdown (i.e. before the hio may be returned to the
   done queue).

4) Add and use the hio_writecount refcounter to detect when
   write_complete() can be called in the memsync case.

Reported by:	Pete French petefrench ingresso.co.uk
Tested by:	Pete French petefrench ingresso.co.uk
MFC after:	2 weeks
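A minimal, self-contained sketch of the scheme described above (this is
not the hastd code: it assumes refcnt_release() returns the remaining
count, which is how the diff below uses it, and replaces the
local_send_thread/remote_recv_thread paths and the GEOM Gate completion
with stand-in functions and printf()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct hio {
        atomic_uint hio_countdown;      /* components still outstanding */
        atomic_uint hio_writecount;     /* events gating write_complete() */
        bool        hio_memsyncacked;   /* memsync ack already seen */
};

/* Drop one reference and return how many remain. */
static unsigned
refcnt_release(atomic_uint *r)
{
        return (atomic_fetch_sub(r, 1) - 1);
}

static void
write_complete(struct hio *hio)
{
        /* In hastd this acknowledges the write to GEOM Gate; called once. */
        printf("write_complete(%p)\n", (void *)hio);
}

/* Local component finished its write (local_send_thread path). */
static void
local_write_done(struct hio *hio)
{
        if (refcnt_release(&hio->hio_writecount) == 0)
                write_complete(hio);
        if (refcnt_release(&hio->hio_countdown) == 0)
                printf("request %p -> done queue\n", (void *)hio);
}

/* A reply arrived from the secondary (remote_recv_thread path). */
static void
remote_reply(struct hio *hio, bool memsyncack)
{
        if (memsyncack && !hio->hio_memsyncacked) {
                /* "memsync ack": data cached remotely, write may complete... */
                if (refcnt_release(&hio->hio_writecount) == 0)
                        write_complete(hio);
                hio->hio_memsyncacked = true;
                /* ...but the remote component still owes a "memsync fin". */
                return;
        }
        /* "memsync fin" (or a non-memsync reply): remote component is done. */
        if (refcnt_release(&hio->hio_countdown) == 0)
                printf("request %p -> done queue\n", (void *)hio);
}

int
main(void)
{
        struct hio hio;

        /* One local and one remote component, as on a hastd primary. */
        hio.hio_memsyncacked = false;
        atomic_init(&hio.hio_countdown, 2);
        atomic_init(&hio.hio_writecount, 2);

        remote_reply(&hio, true);       /* memsync ack arrives first */
        local_write_done(&hio);         /* write_complete() fires here */
        remote_reply(&hio, false);      /* memsync fin; request is done */
        return (0);
}

Whichever thread releases hio_writecount to zero calls write_complete()
exactly once, while hio_countdown independently decides when the
request may move to the done queue; error paths release the same
counters, which is what closes the leak and the race described above.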
Notes:
    svn path=/head/; revision=259191
Diffstat (limited to 'sbin')
-rw-r--r--   sbin/hastd/primary.c   158
1 file changed, 47 insertions, 111 deletions
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
index 039eae318646..9c3645f6faee 100644
--- a/sbin/hastd/primary.c
+++ b/sbin/hastd/primary.c
@@ -94,6 +94,15 @@ struct hio {
*/
bool hio_done;
/*
+ * Number of components we are still waiting before sending write
+ * completion ack to GEOM Gate. Used for memsync.
+ */
+ refcnt_t hio_writecount;
+ /*
+ * Memsync request was acknowledged by remote.
+ */
+ bool hio_memsyncacked;
+ /*
* Remember replication from the time the request was initiated,
* so we won't get confused when replication changes on reload.
*/
@@ -231,6 +240,9 @@ static pthread_mutex_t metadata_lock;
#define ISSYNCREQ(hio) ((hio)->hio_ggio.gctl_unit == -1)
#define SYNCREQDONE(hio) do { (hio)->hio_ggio.gctl_unit = -2; } while (0)
#define ISSYNCREQDONE(hio) ((hio)->hio_ggio.gctl_unit == -2)
+#define ISMEMSYNCWRITE(hio) \
+ (((hio)->hio_replication == HAST_REPLICATION_MEMSYNC && \
+ (hio)->hio_ggio.gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio)))
static struct hast_resource *gres;
@@ -1344,6 +1356,10 @@ ggate_recv_thread(void *arg)
} else {
mtx_unlock(&res->hr_amp_lock);
}
+ if (hio->hio_replication == HAST_REPLICATION_MEMSYNC) {
+ hio->hio_memsyncacked = false;
+ refcnt_init(&hio->hio_writecount, ncomps);
+ }
break;
case BIO_DELETE:
res->hr_stat_delete++;
@@ -1354,13 +1370,7 @@ ggate_recv_thread(void *arg)
}
pjdlog_debug(2,
"ggate_recv: (%p) Moving request to the send queues.", hio);
- if (hio->hio_replication == HAST_REPLICATION_MEMSYNC &&
- ggio->gctl_cmd == BIO_WRITE) {
- /* Each remote request needs two responses in memsync. */
- refcnt_init(&hio->hio_countdown, ncomps + 1);
- } else {
- refcnt_init(&hio->hio_countdown, ncomps);
- }
+ refcnt_init(&hio->hio_countdown, ncomps);
for (ii = ncomp; ii < ncomps; ii++)
QUEUE_INSERT1(hio, send, ii);
}
@@ -1470,42 +1480,13 @@ local_send_thread(void *arg)
}
break;
}
-
- if (hio->hio_replication != HAST_REPLICATION_MEMSYNC ||
- ggio->gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) {
- if (refcnt_release(&hio->hio_countdown) > 0)
- continue;
- } else {
- /*
- * Depending on hio_countdown value, requests finished
- * in the following order:
- * 0: remote memsync, remote final, local write
- * 1: remote memsync, local write, (remote final)
- * 2: local write, (remote memsync), (remote final)
- */
- switch (refcnt_release(&hio->hio_countdown)) {
- case 0:
- /*
- * Local write finished as last.
- */
- break;
- case 1:
- /*
- * Local write finished after remote memsync
- * reply arrvied. We can complete the write now.
- */
- if (hio->hio_errors[0] == 0)
- write_complete(res, hio);
- continue;
- case 2:
- /*
- * Local write finished as first.
- */
- continue;
- default:
- PJDLOG_ABORT("Invalid hio_countdown.");
+ if (ISMEMSYNCWRITE(hio)) {
+ if (refcnt_release(&hio->hio_writecount) == 0) {
+ write_complete(res, hio);
}
}
+ if (refcnt_release(&hio->hio_countdown) > 0)
+ continue;
if (ISSYNCREQ(hio)) {
mtx_lock(&sync_lock);
SYNCREQDONE(hio);
@@ -1627,10 +1608,8 @@ remote_send_thread(void *arg)
nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, "seq");
nv_add_uint64(nv, offset, "offset");
nv_add_uint64(nv, length, "length");
- if (hio->hio_replication == HAST_REPLICATION_MEMSYNC &&
- ggio->gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio)) {
+ if (ISMEMSYNCWRITE(hio))
nv_add_uint8(nv, 1, "memsync");
- }
if (nv_error(nv) != 0) {
hio->hio_errors[ncomp] = nv_error(nv);
pjdlog_debug(2,
@@ -1709,8 +1688,12 @@ done_queue:
} else {
mtx_unlock(&res->hr_amp_lock);
}
- if (hio->hio_replication == HAST_REPLICATION_MEMSYNC)
- (void)refcnt_release(&hio->hio_countdown);
+ if (ISMEMSYNCWRITE(hio)) {
+ if (refcnt_release(&hio->hio_writecount) == 0) {
+ if (hio->hio_errors[0] == 0)
+ write_complete(res, hio);
+ }
+ }
}
if (refcnt_release(&hio->hio_countdown) > 0)
continue;
@@ -1768,6 +1751,7 @@ remote_recv_thread(void *arg)
hio_next[ncomp]);
hio_recv_list_size[ncomp]--;
mtx_unlock(&hio_recv_list_lock[ncomp]);
+ hio->hio_errors[ncomp] = ENOTCONN;
goto done_queue;
}
if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) {
@@ -1841,82 +1825,34 @@ remote_recv_thread(void *arg)
hio->hio_errors[ncomp] = 0;
nv_free(nv);
done_queue:
- if (hio->hio_replication != HAST_REPLICATION_MEMSYNC ||
- hio->hio_ggio.gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) {
- if (refcnt_release(&hio->hio_countdown) > 0)
- continue;
- } else {
- /*
- * Depending on hio_countdown value, requests finished
- * in the following order:
- *
- * 0: local write, remote memsync, remote final
- * or
- * 0: remote memsync, local write, remote final
- *
- * 1: local write, remote memsync, (remote final)
- * or
- * 1: remote memsync, remote final, (local write)
- *
- * 2: remote memsync, (local write), (remote final)
- * or
- * 2: remote memsync, (remote final), (local write)
- */
- switch (refcnt_release(&hio->hio_countdown)) {
- case 0:
- /*
- * Remote final reply arrived.
- */
- PJDLOG_ASSERT(!memsyncack);
- break;
- case 1:
- if (memsyncack) {
- /*
- * Local request already finished, so we
- * can complete the write.
- */
+ if (ISMEMSYNCWRITE(hio)) {
+ if (!hio->hio_memsyncacked) {
+ PJDLOG_ASSERT(memsyncack ||
+ hio->hio_errors[ncomp] != 0);
+ /* Remote ack arrived. */
+ if (refcnt_release(&hio->hio_writecount) == 0) {
if (hio->hio_errors[0] == 0)
write_complete(res, hio);
- /*
- * We still need to wait for final
- * remote reply.
- */
+ }
+ hio->hio_memsyncacked = true;
+ if (hio->hio_errors[ncomp] == 0) {
pjdlog_debug(2,
- "remote_recv: (%p) Moving request back to the recv queue.",
- hio);
+ "remote_recv: (%p) Moving request "
+ "back to the recv queue.", hio);
mtx_lock(&hio_recv_list_lock[ncomp]);
TAILQ_INSERT_TAIL(&hio_recv_list[ncomp],
hio, hio_next[ncomp]);
hio_recv_list_size[ncomp]++;
mtx_unlock(&hio_recv_list_lock[ncomp]);
- } else {
- /*
- * Remote final reply arrived before
- * local write finished.
- * Nothing to do in such case.
- */
+ continue;
}
- continue;
- case 2:
- /*
- * We received remote memsync reply even before
- * local write finished.
- */
- PJDLOG_ASSERT(memsyncack);
-
- pjdlog_debug(2,
- "remote_recv: (%p) Moving request back to the recv queue.",
- hio);
- mtx_lock(&hio_recv_list_lock[ncomp]);
- TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio,
- hio_next[ncomp]);
- hio_recv_list_size[ncomp]++;
- mtx_unlock(&hio_recv_list_lock[ncomp]);
- continue;
- default:
- PJDLOG_ABORT("Invalid hio_countdown.");
+ } else {
+ PJDLOG_ASSERT(!memsyncack);
+ /* Remote final reply arrived. */
}
}
+ if (refcnt_release(&hio->hio_countdown) > 0)
+ continue;
if (ISSYNCREQ(hio)) {
mtx_lock(&sync_lock);
SYNCREQDONE(hio);