This is:
6483887 without direct management, arc ghost lists can run amok
The fix I have in mind is to control the ghost lists as part of
the arc_buf_hdr_t allocations. If you want to test out my fix,
I can send you some diffs...
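
Roughly, the idea is something like this (untested sketch only, not the
actual diffs; it assumes arc_evict_ghost() takes an arc_state_t and a byte
count, and the hypothetical helper would be called from the arc_buf_hdr_t
allocation path):

	/*
	 * Sketch only: when a new arc_buf_hdr_t is allocated, keep the
	 * combined ghost lists from growing beyond the target cache size.
	 */
	static void
	arc_ghost_limit(void)
	{
		int64_t ghost_size = arc.mru_ghost->size + arc.mfu_ghost->size;

		if (ghost_size > arc.c)
			arc_evict_ghost(arc.mru_ghost, ghost_size - arc.c);
	}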
-Mark
Juergen Keil wrote:
>> Jürgen Keil writes:
>> > > ZFS 11.0 on Solaris release 06/06 hangs the system when
>> > > trying to copy files from my VXFS 4.1 file system.
>> > > Any ideas what this problem could be?
>> >
>> > What kind of system is that? How much memory is installed?
>> >
>> > I'm able to hang an Ultra 60 with 256 MByte of main memory,
>> > simply by writing big files to a ZFS filesystem. The problem
>> > happens with both Solaris 10 6/2006 and Solaris Express snv_48.
>> >
>> > In my case there seems to be a problem with ZFS' ARC cache,
>> > which is not returning memory to the kernel when free memory
>> > gets low. Instead, ZFS' ARC cache data structures keep growing
>> > until the machine is running out of kernel memory. At this point
>> > the machine hangs, lots of kernel threads are waiting for free memory,
>> > and the box must be power cycled (well, unplugging and re-connecting
>> > the type 5 keyboard works and gets me to the OBP, where I can force
>> > a system crashdump and reboot).
>>
>>Seems like:
>>
>>6429205 each zpool needs to monitor its throughput and throttle heavy writers
>>  (also fixes: 6415647 Sequential writing is jumping)
>
>
> I'm not sure.
>
> My problem on the Ultra 60 with 256 MByte of main memory is this:
>
> I'm trying to set up an "amanda" backup server on that Ultra 60; it
> receives backups from other amanda client systems over the network
> (could be big files up to 150 GBytes) and writes the received data
> to a zpool / zfs on a big 300GB USB HDD.
>
>
> When ~ 25-30 GBytes of data has been written to the zfs filesystem, the
> machine gets slower and slower and starts paging like crazy
> ("pi" high, "sr" high):
>
>
> # vmstat 1
>  kthr      memory            page            disk          faults      cpu
>  r b w   swap  free  re  mf pi po fr de sr f0 s0 s1 s3   in   sy   cs us sy id
>  0 0 0 1653608 38552 17 13 172 38 63 0 1318 0 9 0 1 1177 1269 871 2 10 88
>  0 0 0 1568624 17544 3 179 1310 0 0 0 0 0 125 0 11 11978 3486 1699 11 89 0
>  0 0 0 1568624 6704 0 174 1538 210 241 0 9217 0 118 0 22 11218 3388 1698 11 89 0
>  1 0 0 1568624 1568 167 367 1359 2699 7987 0 185763 0 131 0 78 7395 2373 1477 7 76 18
>  0 0 0 1568624 5288 119 282 1321 2811 4089 0 183360 0 129 0 47 3161 743 4001 3 59 38
>  0 0 0 1568624 15688 41 247 1586 655 648 0 0 0 131 0 40 1637 97 8684 1 31 68
>  1 0 0 1568624 24968 18 214 1600 16 16 0 0 0 129 0 28 1473 75 8685 1 30 69
>  3 0 0 1574232 29032 30 226 1718 0 0 0 0 0 126 0 20 6978 3232 2671 7 60 33
>  1 0 0 1568624 18248 40 314 2354 0 0 0 0 0 119 0 68 11085 4156 2125 12 87 1
>  1 0 0 1568624 7520 43 299 2162 950 1426 0 25595 0 125 0 45 11452 3434 1888 10 90 0
>  0 0 0 1568624 3384 201 360 2135 2897 8097 0 195148 0 105 0 58 9129 2956 1650 8 92 0
>  2 0 0 1568624 9656 66 241 1791 2036 2655 0 138976 0 157 0 32 2016 243 6349 1 51 48
>  0 0 0 1568624 18496 42 289 2900 131 131 0 0 0 150 0 51 1706 104 8640 2 32 66
>  2 0 0 1572416 27688 77 324 1723 46 46 0 0 0 89 0 54 2440 572 6214 3 35 62
>  0 0 0 1570872 24112 19 203 1506 0 0 0 0 0 110 0 27 12193 4103 2013 11 89 0
>  3 0 0 1568624 13760 12 250 2269 0 0 0 0 0 102 0 58 6804 2772 1713 8 51 41
>  2 0 0 1568624 7464 67 283 1779 1336 5188 0 44171 0 98 0 69 9749 3071 1889 9 81 11
>
>
> In mdb ::kmastat, I see that a huge number of arc_buf_hdr_t entries are
> allocated. The number of arc_buf_hdr_t entries keeps growing, until the
> kernel runs out of memory and the machine hangs.
>
> ...
> (::kmastat columns: cache name, buf size, buf in use, buf total, memory in use, alloc succeed, alloc fail)
> zio_buf_512 512 76 150 81920 39230 0
> zio_buf_1024 1024 6 16 16384 32602 0
> zio_buf_1536 1536 0 5 8192 608 0
> zio_buf_2048 2048 0 4 8192 2246 0
> zio_buf_2560 2560 0 3 8192 777 0
> zio_buf_3072 3072 0 8 24576 913 0
> zio_buf_3584 3584 0 9 32768 26041 0
> zio_buf_4096 4096 3 4 16384 4563 0
> zio_buf_5120 5120 0 8 40960 229 0
> zio_buf_6144 6144 0 4 24576 71 0
> zio_buf_7168 7168 0 8 57344 24 0
> zio_buf_8192 8192 0 2 16384 808 0
> zio_buf_10240 10240 0 4 40960 1083 0
> zio_buf_12288 12288 0 2 24576 1145 0
> zio_buf_14336 14336 0 4 57344 55396 0
> zio_buf_16384 16384 18 19 311296 13957 0
> zio_buf_20480 20480 0 2 40960 878 0
> zio_buf_24576 24576 0 2 49152 69 0
> zio_buf_28672 28672 0 4 114688 104 0
> zio_buf_32768 32768 0 2 65536 310 0
> zio_buf_40960 40960 0 2 81920 152 0
> zio_buf_49152 49152 0 2 98304 215 0
> zio_buf_57344 57344 0 2 114688 335 0
> zio_buf_65536 65536 0 2 131072 742 0
> zio_buf_73728 73728 0 2 147456 433 0
> zio_buf_81920 81920 0 2 163840 412 0
> zio_buf_90112 90112 0 2 180224 634 0
> zio_buf_98304 98304 0 2 196608 1190 0
> zio_buf_106496 106496 0 2 212992 1502 0
> zio_buf_114688 114688 0 2 229376 277544 0
> zio_buf_122880 122880 0 2 245760 2456 0
> zio_buf_131072 131072 357 359 47054848 3046795 0
> dmu_buf_impl_t 328 454 912 311296 2557744 0
> dnode_t 648 91 144 98304 454 0
> arc_buf_hdr_t 128 535823 535878 69681152 2358050 0   <<<<<<
> arc_buf_t 40 382 812 32768 2370019 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 22 126 24576 241 0
> ...
>
>
> At the same time, I see that the ARC target cache size "arc.c" has
> been reduced to the minimum allowed size: "arc.c == arc.c_min"
>
> # mdb -k
> Loading modules: [ unix krtld genunix specfs dtrace ufs sd ip sctp usba
> s1394 fcp fctl qlc ssd nca zfs random lofs nfs audiosup logindmux ptm md
> cpc fcip sppp crypto ipc ]
>
>>arc::print
>
> {
> anon = ARC_anon
> mru = ARC_mru
> mru_ghost = ARC_mru_ghost
> mfu = ARC_mfu
> mfu_ghost = ARC_mfu_ghost
> size = 0x2dec200
> p = 0x9c58c8
> c = 0x4000000                 <<<<<<<<<<<<<<<<<<<<<<<
> c_min = 0x4000000             <<<<<<<<<<<<<<<<<<<<<<<
> c_max = 0xb80c800
> hits = 0x650d
> misses = 0xf2
> deleted = 0x3884e
> skipped = 0
> hash_elements = 0x7e83b
> hash_elements_max = 0x7e83b
> hash_collisions = 0xa56ef
> hash_chains = 0x1000
> Segmentation fault (core dumped)   <<< That's another, unrelated mdb
> problem in S10 6/2006
>
>
>
> Monitoring ::kmastat while writing data to the zfs gives me something
> like this (note how the zio_buf_131072 cache grows and shrinks, but the
> arc_buf_hdr_t cache keeps growing all the time):
>
>
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 587 610 79953920 329670 0
> dmu_buf_impl_t 328 731 984 335872 296319 0
> dnode_t 648 103 168 114688 1469 0
> arc_buf_hdr_t 128 11170 11214 1458176 272535 0
> arc_buf_t 40 642 1015 40960 272560 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 644 738 96731136 410864 0
> dmu_buf_impl_t 328 735 984 335872 364413 0
> dnode_t 648 73 168 114688 1472 0
> arc_buf_hdr_t 128 31673 31689 4120576 334357 0
> arc_buf_t 40 677 1015 40960 335571 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 566 671 87949312 466149 0
> dmu_buf_impl_t 328 668 984 335872 410679 0
> dnode_t 648 74 168 114688 1483 0
> arc_buf_hdr_t 128 45589 45612 5931008 376351 0
> arc_buf_t 40 609 1015 40960 378393 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 536 714 93585408 479158 0
> dmu_buf_impl_t 328 694 984 335872 421623 0
> dnode_t 648 74 168 114688 1483 0
> arc_buf_hdr_t 128 48878 48888 6356992 386276 0
> arc_buf_t 40 635 1015 40960 388509 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 738 740 96993280 530363 0
> dmu_buf_impl_t 328 831 984 335872 464574 0
> dnode_t 648 74 168 114688 1483 0
> arc_buf_hdr_t 128 61789 61803 8036352 425230 0
> arc_buf_t 40 771 1015 40960 428244 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 585 609 79822848 551540 0
> dmu_buf_impl_t 328 697 984 335872 482245 0
> dnode_t 648 74 168 114688 1483 0
> arc_buf_hdr_t 128 67101 67158 8732672 441277 0
> arc_buf_t 40 638 1015 40960 444617 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 714 716 93847552 750232 0
> dmu_buf_impl_t 328 805 984 335872 648794 0
> dnode_t 648 74 168 114688 1485 0
> arc_buf_hdr_t 128 117201 117243 15245312 592453 0
> arc_buf_t 40 746 1015 40960 598751 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 703 705 92405760 824503 0
> dmu_buf_impl_t 328 795 984 335872 711001 0
> dnode_t 648 74 168 114688 1485 0
> arc_buf_hdr_t 128 135923 135954 17678336 648924 0
> arc_buf_t 40 735 1015 40960 656282 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 671 672 88080384 870370 0
> dmu_buf_impl_t 328 828 984 335872 749488 0
> dnode_t 648 74 168 114688 1485 0
> arc_buf_hdr_t 128 147515 147546 19185664 683917 0
> arc_buf_t 40 777 1015 40960 692025 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
> ....
> zio_buf_106496 106496 0 0 0 0 0
> zio_buf_114688 114688 0 2 229376 56 0
> zio_buf_122880 122880 0 0 0 0 0
> zio_buf_131072 131072 676 677 88735744 1002908 0
> dmu_buf_impl_t 328 774 984 335872 860504 0
> dnode_t 648 73 168 114688 1488 0
> arc_buf_hdr_t 128 180874 180936 23527424 784588 0
> arc_buf_t 40 714 1015 40960 794600 0
> zil_lwb_cache 208 0 0 0 0 0
> zfs_znode_cache 192 5 42 8192 10 0
>
>
>
> It seems the problem is that we keep adding arc.mru_ghost / arc.mfu_ghost
> list entries while writing data to zfs, but when the ARC cache is running
> at its minimum size, there's no one checking the ghost list sizes any more;
> no one is calling arc_evict_ghost() to clean up the arc ghost lists.
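>
> (Why only the small headers pile up: a ghost list tracks blocks whose data
> has already been evicted, so only the 128-byte arc_buf_hdr_t stays around
> while the 128K data buffer is freed. Roughly, and with illustrative names
> that are not the actual arc.c code, eviction to a ghost state looks like:)
>
>	/*
>	 * Conceptual sketch only, not the real source: the data buffer has
>	 * already been freed, but the header is kept on the ghost list so a
>	 * later hit on this block can be detected.  Nothing here ever trims
>	 * the ghost list, so the headers accumulate.
>	 */
>	static void
>	move_to_ghost(arc_state_t *ghost, arc_buf_hdr_t *hdr)
>	{
>		list_insert_head(&ghost->list, hdr);
>		atomic_add_64(&ghost->size, hdr->b_size);
>	}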
>
>
> arc_evict_ghost() is called from arc_adjust():
>
>
> http://cvs.opensolaris.org/source/xref/on/usr/src/uts/common/fs/zfs/arc.c#arc_adjust
>
>
> And arc_adjust() is called from arc_kmem_reclaim()...
>
>
> http://cvs.opensolaris.org/source/xref/on/usr/src/uts/common/fs/zfs/arc.c#arc_kmem_reclaim
>
>
> ... but only when "arc.c > arc.c_min":
>
>
> http://cvs.opensolaris.org/source/xref/on/usr/src/uts/common/fs/zfs/arc.c#1170
>
> 1170 if (arc.c <= arc.c_min)
> 1171 return;
>
>
> When arc.c <= arc.c_min: no more arc_adjust() calls, and no more
> arc_evict_ghost() calls.
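>
> In other words, the unpatched code path looks roughly like this (simplified
> and illustrative only; see the xref links above for the real source):
>
>	void
>	arc_kmem_reclaim(void)
>	{
>		...
>		if (arc.c <= arc.c_min)
>			return;		/* once c has dropped to c_min we always bail out here, */
>		...
>		arc_adjust();		/* so this call, and the arc_evict_ghost() calls inside it, */
>		...			/* are never reached again */
>	}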
>
>
> ==============================================================================
> I'm currently experimenting with an arc_kmem_reclaim() changed like this,
> which seems to work fine so far, no more hangs:
>
>
> void
> arc_kmem_reclaim(void)
> {
> 	uint64_t to_free;
>
> 	/*
> 	 * We need arc_reclaim_lock because we don't want multiple
> 	 * threads trying to reclaim concurrently.
> 	 */
>
> 	/*
> 	 * umem calls the reclaim func when we destroy the buf cache,
> 	 * which is after we do arc_fini(). So we set a flag to prevent
> 	 * accessing the destroyed mutexes and lists.
> 	 */
> 	if (arc_dead)
> 		return;
>
> 	mutex_enter(&arc_reclaim_lock);
>
> 	if (arc.c > arc.c_min) {
> #ifdef _KERNEL
> 		to_free = MAX(arc.c >> arc_kmem_reclaim_shift, ptob(needfree));
> #else
> 		to_free = arc.c >> arc_kmem_reclaim_shift;
> #endif
> 		if (arc.c > to_free)
> 			atomic_add_64(&arc.c, -to_free);
> 		else
> 			arc.c = arc.c_min;
>
> 		atomic_add_64(&arc.p, -(arc.p >> arc_kmem_reclaim_shift));
> 		if (arc.c > arc.size)
> 			arc.c = arc.size;
> 		if (arc.c < arc.c_min)
> 			arc.c = arc.c_min;
> 		if (arc.p > arc.c)
> 			arc.p = (arc.c >> 1);
> 		ASSERT((int64_t)arc.p >= 0);
> 	}
>
> 	arc_adjust();
>
> 	mutex_exit(&arc_reclaim_lock);
> }
>
>
>
> =============================================================================
>
> I'm able to reproduce the issue with a test program like the one included
> below. Run it with the current directory on a zfs filesystem, and on a
> machine with only 256 MByte of main memory.
>
>
> /*
>  * gcc `getconf LFS_CFLAGS` fill.c -o fill
>  */
> #include <stdlib.h>
> #include <stdio.h>
> #include <string.h>
> #include <unistd.h>
> #include <fcntl.h>
>
> int
> main(int argc, char **argv)
> {
> 	char buf[32*1024];
> 	int i, n;
> 	int fd;
>
> 	fd = open("/dev/random", O_RDONLY);
> 	if (fd < 0) {
> 		perror("/dev/random");
> 		memset(buf, '*', sizeof(buf));
> 	} else {
> 		for (n = 0; n < sizeof(buf); n += i) {
> 			i = read(fd, buf+n, sizeof(buf)-n);
> 			if (i < 0) {
> 				perror("read random data");
> 				exit(1);
> 			}
> 			if (i == 0) {
> 				fprintf(stderr, "EOF reading random data\n");
> 				exit(1);
> 			}
> 		}
> 		close(fd);
> 	}
> 	fd = creat("junk", 0666);
> 	if (fd < 0) {
> 		perror("create junk file");
> 		exit(1);
> 	}
> 	for (;;) {
> 		if (write(fd, buf, sizeof(buf)) != sizeof(buf)) {
> 			perror("write data");
> 			break;
> 		}
> 	}
> 	close(fd);
> 	exit(0);
> }
>
> _______________________________________________
> zfs-discuss mailing list
> zfs-discuss at opensolaris.org
> http://mail.opensolaris.org/mailman/listinfo/zfs-discuss