On Fri, Dec 22, 2017 at 10:26:07AM +0100, Peter Holm wrote:
> Here's some more info, using the original scenario:
> https://people.freebsd.org/~pho/stress/log/kostik1070.txt
This is somewhat weird but also not too puzzling.
The vmdaemon (pid 41) is running; it tries to reduce the count of resident
pages in some pmap, most likely the one of pid 20655. This process
seems to be huge: according to the v_stats, there are 15681264 inactive pages,
and the pagedaemon tries to obtain a vm object lock which is owned by
vmdaemon; the resident count for that object is 15897170 (~64Gb).
So basically almost all memory belongs to a single object, and vmdaemon is
processing it. Since the object's queue is huge, the map and object
locks are held for a long time, preventing other processes that touch them
from making progress.
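(For scale: assuming the usual 4KB page size, 15897170 pages * 4096 bytes/page
is about 60GiB, i.e. practically all of the machine's memory in one object.)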
Perhaps try this (it combines the new changes with the OOM patch). I am not
sure that the should_yield() check in vm_swapout_object_deactivate_pages() is
a good idea unconditionally, but it might be better than the current
situation.
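In case you want to tune the behaviour: the knobs added below are runtime
sysctls (CTLFLAG_RWTUN). Setting vm.pfault_oom_attempts to -1 makes the fault
path retry the allocation forever and never start OOM, and vm.pfault_oom_wait
is the number of seconds each retry sleeps; the defaults are 3 attempts of
10 seconds each.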
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index ece496407c2..ce6208569c6 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -134,6 +134,16 @@ static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
int backward, int forward);
+static int vm_pfault_oom_attempts = 3;
+SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
+ &vm_pfault_oom_attempts, 0,
+ "");
+
+static int vm_pfault_oom_wait = 10;
+SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
+ &vm_pfault_oom_wait, 0,
+ "");
+
static inline void
release_page(struct faultstate *fs)
{
@@ -531,7 +541,7 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
vm_pindex_t retry_pindex;
vm_prot_t prot, retry_prot;
int ahead, alloc_req, behind, cluster_offset, error, era, faultcount;
- int locked, nera, result, rv;
+ int locked, nera, oom, result, rv;
u_char behavior;
boolean_t wired; /* Passed by reference. */
bool dead, hardfault, is_first_object_locked;
@@ -542,7 +552,9 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
nera = -1;
hardfault = false;
-RetryFault:;
+RetryFault:
+ oom = 0;
+RetryFault_oom:
/*
* Find the backing store object and offset into it to begin the
@@ -787,7 +799,17 @@ RetryFault:;
}
if (fs.m == NULL) {
unlock_and_deallocate(&fs);
- VM_WAITPFAULT;
+ if (vm_pfault_oom_attempts < 0 ||
+ oom < vm_pfault_oom_attempts) {
+ oom++;
+ vm_waitpfault(vm_pfault_oom_wait);
+ goto RetryFault_oom;
+ }
+ if (bootverbose)
+ printf(
+ "proc %d (%s) failed to alloc page on fault, starting OOM\n",
+ curproc->p_pid, curproc->p_comm);
+ vm_pageout_oom(VM_OOM_MEM_PF);
goto RetryFault;
}
}
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 0397dfef457..7d403497550 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2652,7 +2652,7 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
* - Called in various places before memory allocations.
*/
static void
-_vm_wait(void)
+_vm_wait(int timo)
{
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
@@ -2669,16 +2669,16 @@ _vm_wait(void)
}
vm_pages_needed = true;
msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
- "vmwait", 0);
+ "vmwait", timo * hz);
}
}
void
-vm_wait(void)
+vm_wait(int timo)
{
mtx_lock(&vm_page_queue_free_mtx);
- _vm_wait();
+ _vm_wait(timo);
}
/*
@@ -2703,7 +2703,7 @@ vm_page_alloc_fail(vm_object_t object, int req)
if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
if (object != NULL)
VM_OBJECT_WUNLOCK(object);
- _vm_wait();
+ _vm_wait(0);
if (object != NULL)
VM_OBJECT_WLOCK(object);
if (req & VM_ALLOC_WAITOK)
@@ -2724,7 +2724,7 @@ vm_page_alloc_fail(vm_object_t object, int req)
* this balance without careful testing first.
*/
void
-vm_waitpfault(void)
+vm_waitpfault(int timo)
{
mtx_lock(&vm_page_queue_free_mtx);
@@ -2734,7 +2734,7 @@ vm_waitpfault(void)
}
vm_pages_needed = true;
msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
- "pfault", 0);
+ "pfault", timo * hz);
}
struct vm_pagequeue *
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 4d8f8be894c..d8289b1182e 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -1648,6 +1648,13 @@ vm_pageout_oom_pagecount(struct vmspace *vmspace)
return (res);
}
+static int vm_oom_ratelim_count;
+static int vm_oom_ratelim_last;
+static int vm_oom_pf_secs = 10;
+SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,
+ "");
+static struct mtx vm_oom_ratelim_mtx;
+
void
vm_pageout_oom(int shortage)
{
@@ -1655,8 +1662,31 @@ vm_pageout_oom(int shortage)
vm_offset_t size, bigsize;
struct thread *td;
struct vmspace *vm;
+ int now;
bool breakout;
+ /*
+ * For OOM requests originating from vm_fault(), there is a high
+ * chance that a single large process faults simultaneously in
+ * several threads. Also, on an active system running many
+ * processes of middle-size, like buildworld, all of them
+ * could fault almost simultaneously as well.
+ *
+ * To avoid killing too many processes, rate-limit OOMs
+ * initiated by vm_fault() time-outs on the waits for free
+ * pages.
+ */
+ mtx_lock(&vm_oom_ratelim_mtx);
+ now = ticks;
+ if ((u_int)(now - vm_oom_ratelim_last) >= hz * vm_oom_pf_secs) {
+ vm_oom_ratelim_last = now;
+ vm_oom_ratelim_count = 0;
+ } else if (vm_oom_ratelim_count++ > 0 && shortage == VM_OOM_MEM_PF) {
+ mtx_unlock(&vm_oom_ratelim_mtx);
+ return;
+ }
+ mtx_unlock(&vm_oom_ratelim_mtx);
+
/*
* We keep the process bigproc locked once we find it to keep anyone
* from messing with it; however, there is a possibility of
@@ -1721,7 +1751,7 @@ vm_pageout_oom(int shortage)
continue;
}
size = vmspace_swap_count(vm);
- if (shortage == VM_OOM_MEM)
+ if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
size += vm_pageout_oom_pagecount(vm);
vm_map_unlock_read(&vm->vm_map);
vmspace_free(vm);
@@ -1917,6 +1947,7 @@ vm_pageout(void)
int i;
#endif
+ mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
swap_pager_swap_init();
error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
0, 0, "laundry: dom0");
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index 2cdb1492fab..7b1a61b3a82 100644
--- a/sys/vm/vm_pageout.h
+++ b/sys/vm/vm_pageout.h
@@ -80,7 +80,8 @@ extern bool vm_pageout_wanted;
extern bool vm_pages_needed;
#define VM_OOM_MEM 1
-#define VM_OOM_SWAPZ 2
+#define VM_OOM_MEM_PF 2
+#define VM_OOM_SWAPZ 3
/*
* vm_lowmem flags.
@@ -96,11 +97,10 @@ extern bool vm_pages_needed;
* Signal pageout-daemon and wait for it.
*/
-extern void pagedaemon_wakeup(void);
-#define VM_WAIT vm_wait()
-#define VM_WAITPFAULT vm_waitpfault()
-extern void vm_wait(void);
-extern void vm_waitpfault(void);
+void pagedaemon_wakeup(void);
+#define VM_WAIT vm_wait(0)
+void vm_wait(int timo);
+void vm_waitpfault(int timo);
#ifdef _KERNEL
int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
diff --git a/sys/vm/vm_swapout.c b/sys/vm/vm_swapout.c
index e3e8b8966d7..19f02b56b5e 100644
--- a/sys/vm/vm_swapout.c
+++ b/sys/vm/vm_swapout.c
@@ -152,6 +152,11 @@ SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
&swap_idle_threshold2, 0,
"Time before a process will be swapped out");
+static int swapper_swapin_oom_timeout = 1;
+SYSCTL_INT(_vm, OID_AUTO, swapper_swapin_oom_timeout, CTLFLAG_RW,
+ &swapper_swapin_oom_timeout, 0,
+ "Interval for swapper to try faultin killed processes on OOM");
+
static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
@@ -164,7 +169,7 @@ static void vm_swapout_map_deactivate_pages(vm_map_t, long);
static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void swapout_procs(int action);
static void vm_req_vmdaemon(int req);
-static void vm_thread_swapin(struct thread *td);
+static void vm_thread_swapin(struct thread *td, bool oom_swapin);
static void vm_thread_swapout(struct thread *td);
/*
@@ -203,6 +208,8 @@ vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
TAILQ_FOREACH(p, &object->memq, listq) {
if (pmap_resident_count(pmap) <= desired)
goto unlock_return;
+ if (should_yield())
+ goto unlock_return;
if (vm_page_busied(p))
continue;
VM_CNT_INC(v_pdpages);
@@ -263,7 +270,7 @@ vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
vm_object_t obj, bigobj;
int nothingwired;
- if (!vm_map_trylock(map))
+ if (!vm_map_trylock_read(map))
return;
bigobj = NULL;
@@ -327,7 +334,7 @@ vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
vm_map_max(map));
}
- vm_map_unlock(map);
+ vm_map_unlock_read(map);
}
/*
@@ -516,8 +523,10 @@ again:
PRELE(p);
}
sx_sunlock(&allproc_lock);
- if (tryagain != 0 && attempts <= 10)
+ if (tryagain != 0 && attempts <= 10) {
+ maybe_yield();
goto again;
+ }
}
}
@@ -552,20 +561,18 @@ vm_thread_swapout(struct thread *td)
* Bring the kernel stack for a specified thread back in.
*/
static void
-vm_thread_swapin(struct thread *td)
+vm_thread_swapin(struct thread *td, bool oom_swapin)
{
vm_object_t ksobj;
vm_page_t ma[KSTACK_MAX_PAGES];
- int pages;
+ int j, a, count, pages, rv;
pages = td->td_kstack_pages;
ksobj = td->td_kstack_obj;
VM_OBJECT_WLOCK(ksobj);
- (void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED, ma,
- pages);
+ (void)vm_page_grab_pages(ksobj, 0, (oom_swapin ? VM_ALLOC_SYSTEM :
+ VM_ALLOC_NORMAL) | VM_ALLOC_WIRED, ma, pages);
for (int i = 0; i < pages;) {
- int j, a, count, rv;
-
vm_page_assert_xbusied(ma[i]);
if (ma[i]->valid == VM_PAGE_BITS_ALL) {
vm_page_xunbusy(ma[i]);
@@ -592,8 +599,8 @@ vm_thread_swapin(struct thread *td)
cpu_thread_swapin(td);
}
-void
-faultin(struct proc *p)
+static void
+faultin1(struct proc *p, bool oom_swapin)
{
struct thread *td;
@@ -622,7 +629,7 @@ faultin(struct proc *p)
* swapped out.
*/
FOREACH_THREAD_IN_PROC(p, td)
- vm_thread_swapin(td);
+ vm_thread_swapin(td, oom_swapin);
PROC_LOCK(p);
swapclear(p);
p->p_swtick = ticks;
@@ -634,6 +641,13 @@ faultin(struct proc *p)
}
}
+void
+faultin(struct proc *p)
+{
+
+ faultin1(p, false);
+}
+
/*
* This swapin algorithm attempts to swap-in processes only if there
* is enough space for them. Of course, if a process waits for a long
@@ -645,14 +659,38 @@ swapper(void)
struct proc *p;
struct thread *td;
struct proc *pp;
- int slptime;
- int swtime;
- int ppri;
- int pri;
+ int ppri, pri, slptime, swtime;
loop:
if (vm_page_count_min()) {
- VM_WAIT;
+ /*
+ * We are low on memory. A swapped-out process might
+ * have mapped a large portion of the system's pages
+ * as anonymous memory. There is no other way to
+ * release the memory other than to kill the process,
+ * for which we need to swap it in.
+ */
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ /*
+ * Ensure that the pages for kernel stacks are
+ * allocated with higher priority.
+ */
+ if (p->p_state == PRS_NORMAL && (p->p_flag &
+ (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM |
+ P_WKILLED)) == P_WKILLED) {
+ sx_sunlock(&allproc_lock);
+ faultin1(p, true);
+ PROC_UNLOCK(p);
+ goto loop;
+ }
+
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+
+ vm_wait(swapper_swapin_oom_timeout * hz);
goto loop;
}
@@ -715,8 +753,7 @@ loop:
}
/*
- * We would like to bring someone in. (only if there is space).
- * [What checks the space? ]
+ * We would like to bring someone in.
*/
faultin(p);
PROC_UNLOCK(p);