Atin Mukherjee
2015-Aug-10 16:52 UTC
[Gluster-users] volume not working after yum update - gluster 3.6.3
-Atin
Sent from one plus one

On Aug 10, 2015 9:47 PM, "Kingsley" <gluster at gluster.dogwind.com> wrote:
>
> On Mon, 2015-08-10 at 21:39 +0530, Atin Mukherjee wrote:
>>
>> -Atin
>> Sent from one plus one
>> On Aug 10, 2015 9:37 PM, "Kingsley" <gluster at gluster.dogwind.com> wrote:
>> >
>> > On Mon, 2015-08-10 at 21:34 +0530, Atin Mukherjee wrote:
>> > > -Atin
>> > > Sent from one plus one
>> > > On Aug 10, 2015 7:19 PM, "Kingsley" <gluster at gluster.dogwind.com> wrote:
>> > > >
>> > > > Further to this, the volume doesn't seem overly healthy. Any idea how I
>> > > > can get it back into a working state?
>> > > >
>> > > > Trying to access one particular directory on the clients just hangs. If
>> > > > I query heal info, that directory appears in the output as possibly
>> > > > undergoing heal (actual directory name changed as it's private info):
>> > > Can you execute strace and see which call is stuck? That would help us
>> > > to get to the exact component which we would need to look at.
>> >
>> > Hi,
>> >
>> > I've never used strace before. Could you give me the command line to
>> > type?
>> Just type strace followed by the command
>
> Is this what you meant? (I renamed the broken directory so that I could
> create another and let the system continue to work with a freshly created
> one.) It ran very quickly and returned me back to the command prompt, but I
> then "cd"d into that directory and did a plain "ls", which then hung, i.e.:
>
> --8<--
> [root at voicemail1b-1 14391.broken]# ls
> ^Z
>
>
> > fg
>
> --8<--
>
> Anyway, the strace:
>
> [root at voicemail1b-1 834723]# strace ls 14391.broken
> execve("/usr/bin/ls", ["ls", "14391.broken"], [/* 27 vars */]) = 0
> brk(0) = 0x158c000
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7d2494c000
> access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
> open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0644, st_size=31874, ...}) = 0
> mmap(NULL, 31874, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f7d24944000
> close(3) = 0
> open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=147120, ...}) = 0
> mmap(NULL, 2246784, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d24509000
> mprotect(0x7f7d2452a000, 2097152, PROT_NONE) = 0
> mmap(0x7f7d2472a000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x21000) = 0x7f7d2472a000
> mmap(0x7f7d2472c000, 6272, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7d2472c000
> close(3) = 0
> open("/lib64/libcap.so.2", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\26\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=20024, ...}) = 0
> mmap(NULL, 2114112, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d24304000
> mprotect(0x7f7d24308000, 2093056, PROT_NONE) = 0
> mmap(0x7f7d24507000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x7f7d24507000
> close(3) = 0
> open("/lib64/libacl.so.1", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200\37\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=37056, ...}) = 0
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7d24943000
> mmap(NULL, 2130560, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d240fb000
> mprotect(0x7f7d24102000, 2097152, PROT_NONE) = 0
> mmap(0x7f7d24302000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x7000) = 0x7f7d24302000
> close(3) = 0
> open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\34\2\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=2107760, ...}) = 0
> mmap(NULL, 3932736, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d23d3a000
> mprotect(0x7f7d23ef0000, 2097152, PROT_NONE) = 0
> mmap(0x7f7d240f0000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b6000) = 0x7f7d240f0000
> mmap(0x7f7d240f6000, 16960, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7d240f6000
> close(3) = 0
> open("/lib64/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\360\25\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=398272, ...}) = 0
> mmap(NULL, 2490888, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d23ad9000
> mprotect(0x7f7d23b38000, 2097152, PROT_NONE) = 0
> mmap(0x7f7d23d38000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x5f000) = 0x7f7d23d38000
> close(3) = 0
> open("/lib64/liblzma.so.5", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0000/\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=153184, ...}) = 0
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7d24942000
> mmap(NULL, 2245240, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d238b4000
> mprotect(0x7f7d238d8000, 2093056, PROT_NONE) = 0
> mmap(0x7f7d23ad7000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x23000) = 0x7f7d23ad7000
> close(3) = 0
> open("/lib64/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\16\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=19512, ...}) = 0
> mmap(NULL, 2109744, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d236b0000
> mprotect(0x7f7d236b3000, 2093056, PROT_NONE) = 0
> mmap(0x7f7d238b2000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7f7d238b2000
> close(3) = 0
> open("/lib64/libattr.so.1", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\23\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=19888, ...}) = 0
> mmap(NULL, 2113904, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d234ab000
> mprotect(0x7f7d234af000, 2093056, PROT_NONE) = 0
> mmap(0x7f7d236ae000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x7f7d236ae000
> close(3) = 0
> open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240l\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=141616, ...}) = 0
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7d24941000
> mmap(NULL, 2208864, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7d2328f000
> mprotect(0x7f7d232a5000, 2097152, PROT_NONE) = 0
> mmap(0x7f7d234a5000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x16000) = 0x7f7d234a5000
> mmap(0x7f7d234a7000, 13408, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7d234a7000
> close(3) = 0
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7d24940000
> mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7d2493e000
> arch_prctl(ARCH_SET_FS, 0x7f7d2493e800) = 0
> mprotect(0x7f7d240f0000, 16384, PROT_READ) = 0
> mprotect(0x7f7d234a5000, 4096, PROT_READ) = 0
> mprotect(0x7f7d236ae000, 4096, PROT_READ) = 0
> mprotect(0x7f7d238b2000, 4096, PROT_READ) = 0
> mprotect(0x7f7d23ad7000, 4096, PROT_READ) = 0
> mprotect(0x7f7d23d38000, 4096, PROT_READ) = 0
> mprotect(0x7f7d24302000, 4096, PROT_READ) = 0
> mprotect(0x7f7d24507000, 4096, PROT_READ) = 0
> mprotect(0x7f7d2472a000, 4096, PROT_READ) = 0
> mprotect(0x61a000, 4096, PROT_READ) = 0
> mprotect(0x7f7d2494f000, 4096, PROT_READ) = 0
> munmap(0x7f7d24944000, 31874) = 0
> set_tid_address(0x7f7d2493ead0) = 17906
> set_robust_list(0x7f7d2493eae0, 24) = 0
> rt_sigaction(SIGRTMIN, {0x7f7d23295780, [], SA_RESTORER|SA_SIGINFO, 0x7f7d2329e130}, NULL, 8) = 0
> rt_sigaction(SIGRT_1, {0x7f7d23295810, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x7f7d2329e130}, NULL, 8) = 0
> rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
> getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
> statfs("/sys/fs/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0
> statfs("/sys/fs/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0
> stat("/sys/fs/selinux", {st_mode=S_IFDIR|0755, st_size=0, ...}) = 0
> brk(0) = 0x158c000
> brk(0x15ad000) = 0x15ad000
> ioctl(1, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0
> ioctl(1, TIOCGWINSZ, {ws_row=41, ws_col=202, ws_xpixel=0, ws_ypixel=0}) = 0
> stat("14391.broken", {st_mode=S_IFDIR|0755, st_size=8192, ...}) = 0
> openat(AT_FDCWD, "14391.broken", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 23 entries */, 32768) = 552
> getdents(3, /* 19 entries */, 32768) = 456
> getdents(3, /* 0 entries */, 32768) = 0
> close(3) = 0
> fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 3), ...}) = 0
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7d2494b000
> write(1, "012 033 046 063 076 087 09"..., 195012 033 046 063 076 087 096 104 112 120 128 136 144 152 160 172 180 195 209 225 235 246 258 279 298 313 343 389 628 900 908 918 926 934 942 950 958 968 980 994
> ) = 195
> write(1, "013 034 049 065 079 088 09"..., 195013 034 049 065 079 088 097 105 113 121 129 137 145 153 161 173 184 196 212 226 236 247 266 281 299 314 348 394 843 901 910 919 927 935 943 951 959 970 981 996
> ) = 195
> write(1, "014 035 050 066 080 089 09"..., 195014 035 050 066 080 089 098 106 114 122 130 138 146 154 162 174 185 197 215 227 237 248 267 288 301 317 349 396 869 902 911 920 928 936 944 952 960 972 982 997
> ) = 195
> write(1, "016 039 052 071 081 090 09"..., 195016 039 052 071 081 090 099 107 115 123 131 139 147 155 163 175 186 198 219 229 238 250 269 291 305 321 350 405 882 903 912 921 929 937 945 953 961 973 984 998
> ) = 195
> write(1, "018 041 055 072 082 091 10"..., 190018 041 055 072 082 091 100 108 116 124 132 140 148 156 164 176 187 203 221 230 239 251 270 292 306 328 354 407 890 904 914 922 930 938 946 954 962 974 985
> ) = 190
> write(1, "019 042 057 073 084 092 10"..., 190019 042 057 073 084 092 101 109 117 125 133 141 149 157 165 177 190 204 222 231 240 253 272 293 308 336 357 413 892 905 915 923 931 939 947 955 965 976 988
> ) = 190
> write(1, "024 043 059 074 085 093 10"..., 190024 043 059 074 085 093 102 110 118 126 134 142 150 158 166 178 193 206 223 232 241 255 274 294 309 339 370 470 895 906 916 924 932 940 948 956 966 977 989
> ) = 190
> write(1, "031 044 060 075 086 095 10"..., 190031 044 060 075 086 095 103 111 119 127 135 143 151 159 167 179 194 207 224 234 243 257 275 296 310 342 386 517 899 907 917 925 933 941 949 957 967 978 993
> ) = 190
> close(1) = 0
> munmap(0x7f7d2494b000, 4096) = 0
> close(2) = 0
> exit_group(0) = ?
> +++ exited with 0 +++

strace output claims the command exited successfully. Are you sure ls got
hung?

>> >
>> > Then ... do I need to run something on one of the bricks while strace is
>> > running?
>> >
>> > Cheers,
>> > Kingsley.
>> >
>> > > >
>> > > > [root at gluster1b-1 ~]# gluster volume heal callrec info
>> > > > Brick gluster1a-1.dns99.co.uk:/data/brick/callrec/
>> > > > <gfid:164f888f-2049-49e6-ad26-c758ee091863>
>> > > > /recordings/834723/14391 - Possibly undergoing heal
>> > > >
>> > > > <gfid:e280b40c-d8b7-43c5-9da7-4737054d7a7f>
>> > > > <gfid:b1fbda4a-732f-4f5d-b5a1-8355d786073e>
>> > > > <gfid:edb74524-b4b7-4190-85e7-4aad002f6e7c>
>> > > > <gfid:9b8b8446-1e27-4113-93c2-6727b1f457eb>
>> > > > <gfid:650efeca-b45c-413b-acc3-f0a5853ccebd>
>> > > > Number of entries: 7
>> > > >
>> > > > Brick gluster1b-1.dns99.co.uk:/data/brick/callrec/
>> > > > Number of entries: 0
>> > > >
>> > > > Brick gluster2a-1.dns99.co.uk:/data/brick/callrec/
>> > > > <gfid:e280b40c-d8b7-43c5-9da7-4737054d7a7f>
>> > > > <gfid:164f888f-2049-49e6-ad26-c758ee091863>
>> > > > <gfid:650efeca-b45c-413b-acc3-f0a5853ccebd>
>> > > > <gfid:b1fbda4a-732f-4f5d-b5a1-8355d786073e>
>> > > > /recordings/834723/14391 - Possibly undergoing heal
>> > > >
>> > > > <gfid:edb74524-b4b7-4190-85e7-4aad002f6e7c>
>> > > > <gfid:9b8b8446-1e27-4113-93c2-6727b1f457eb>
>> > > > Number of entries: 7
>> > > >
>> > > > Brick gluster2b-1.dns99.co.uk:/data/brick/callrec/
>> > > > Number of entries: 0
>> > > >
>> > > >
>> > > > If I query each brick directly for the number of files/directories
>> > > > within that, I get 1731 on gluster1a-1 and gluster2a-1, but 1737 on the
>> > > > other two, using this command:
>> > > >
>> > > > # find /data/brick/callrec/recordings/834723/14391 -print | wc -l
>> > > >
>> > > > Cheers,
>> > > > Kingsley.
>> > > >
>> > > > On Mon, 2015-08-10 at 11:05 +0100, Kingsley wrote:
>> > > > > Sorry for the blind panic - restarting the volume seems to have fixed
>> > > > > it.
>> > > > >
>> > > > > But then my next question - why is this necessary? Surely it undermines
>> > > > > the whole point of a high availability system?
>> > > > >
>> > > > > Cheers,
>> > > > > Kingsley.
>> > > > >
>> > > > > On Mon, 2015-08-10 at 10:53 +0100, Kingsley wrote:
>> > > > > > Hi,
>> > > > > >
>> > > > > > We have a 4 way replicated volume using gluster 3.6.3 on CentOS 7.
>> > > > > >
>> > > > > > Over the weekend I did a yum update on each of the bricks in turn, but
>> > > > > > now when clients (using fuse mounts) try to access the volume, it hangs.
>> > > > > > Gluster itself wasn't updated (we've disabled that repo so that we keep
>> > > > > > to 3.6.3 for now).
>> > > > > >
>> > > > > > This was what I did:
>> > > > > >
>> > > > > >       * on first brick, "yum update"
>> > > > > >       * reboot brick
>> > > > > >       * watch "gluster volume status" on another brick and wait for it
>> > > > > >         to say all 4 bricks are online before proceeding to update the
>> > > > > >         next brick
>> > > > > >
>> > > > > > I was expecting the clients might pause 30 seconds while they notice a
>> > > > > > brick is offline, but then recover.
>> > > > > >
>> > > > > > I've tried re-mounting clients, but that hasn't helped.
>> > > > > >
>> > > > > > I can't see much data in any of the log files.
>> > > > > >
>> > > > > > I've tried "gluster volume heal callrec" but it doesn't seem to have
>> > > > > > helped.
>> > > > > >
>> > > > > > What shall I do next?
>> > > > > >
>> > > > > > I've pasted some stuff below in case any of it helps.
>> > > > > >
>> > > > > > Cheers,
>> > > > > > Kingsley.
>> > > > > >
>> > > > > > [root at gluster1b-1 ~]# gluster volume info callrec
>> > > > > >
>> > > > > > Volume Name: callrec
>> > > > > > Type: Replicate
>> > > > > > Volume ID: a39830b7-eddb-4061-b381-39411274131a
>> > > > > > Status: Started
>> > > > > > Number of Bricks: 1 x 4 = 4
>> > > > > > Transport-type: tcp
>> > > > > > Bricks:
>> > > > > > Brick1: gluster1a-1:/data/brick/callrec
>> > > > > > Brick2: gluster1b-1:/data/brick/callrec
>> > > > > > Brick3: gluster2a-1:/data/brick/callrec
>> > > > > > Brick4: gluster2b-1:/data/brick/callrec
>> > > > > > Options Reconfigured:
>> > > > > > performance.flush-behind: off
>> > > > > > [root at gluster1b-1 ~]#
>> > > > > >
>> > > > > >
>> > > > > > [root at gluster1b-1 ~]# gluster volume status callrec
>> > > > > > Status of volume: callrec
>> > > > > > Gluster process                                 Port    Online  Pid
>> > > > > > ------------------------------------------------------------------------------
>> > > > > > Brick gluster1a-1:/data/brick/callrec           49153   Y       6803
>> > > > > > Brick gluster1b-1:/data/brick/callrec           49153   Y       2614
>> > > > > > Brick gluster2a-1:/data/brick/callrec           49153   Y       2645
>> > > > > > Brick gluster2b-1:/data/brick/callrec           49153   Y       4325
>> > > > > > NFS Server on localhost                         2049    Y       2769
>> > > > > > Self-heal Daemon on localhost                   N/A     Y       2789
>> > > > > > NFS Server on gluster2a-1                       2049    Y       2857
>> > > > > > Self-heal Daemon on gluster2a-1                 N/A     Y       2814
>> > > > > > NFS Server on 88.151.41.100                     2049    Y       6833
>> > > > > > Self-heal Daemon on 88.151.41.100               N/A     Y       6824
>> > > > > > NFS Server on gluster2b-1                       2049    Y       4428
>> > > > > > Self-heal Daemon on gluster2b-1                 N/A     Y       4387
>> > > > > >
>> > > > > > Task Status of Volume callrec
>> > > > > > ------------------------------------------------------------------------------
>> > > > > > There are no active volume tasks
>> > > > > >
>> > > > > > [root at gluster1b-1 ~]#
>> > > > > >
>> > > > > >
>> > > > > > [root at gluster1b-1 ~]# gluster volume heal callrec info
>> > > > > > Brick gluster1a-1.dns99.co.uk:/data/brick/callrec/
>> > > > > > /to_process - Possibly undergoing heal
>> > > > > >
>> > > > > > Number of entries: 1
>> > > > > >
>> > > > > > Brick gluster1b-1.dns99.co.uk:/data/brick/callrec/
>> > > > > > Number of entries: 0
>> > > > > >
>> > > > > > Brick gluster2a-1.dns99.co.uk:/data/brick/callrec/
>> > > > > > /to_process - Possibly undergoing heal
>> > > > > >
>> > > > > > Number of entries: 1
>> > > > > >
>> > > > > > Brick gluster2b-1.dns99.co.uk:/data/brick/callrec/
>> > > > > > Number of entries: 0
>> > > > > >
>> > > > > > [root at gluster1b-1 ~]#
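For a hang like the one traced above, strace output is usually more telling when it carries timestamps and per-call durations and is written to a file, and attaching to a process that is already stuck shows the exact blocked syscall. A minimal sketch, reusing the directory name from this thread and a placeholder <pid> for the hung process:

  # trace a fresh command, following child processes, with timestamps and
  # per-call durations, written to a file so the trace survives a wedged terminal
  strace -f -tt -T -o /tmp/ls.trace ls 14391.broken

  # attach to an already-hung process to see which syscall it is blocked in
  # (find <pid> with something like: ps aux | grep mkdir)
  strace -p <pid>

  # kernel-side view of the same hang (requires root)
  cat /proc/<pid>/stack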
Kingsley
2015-Aug-10 17:04 UTC
[Gluster-users] volume not working after yum update - gluster 3.6.3
On Mon, 2015-08-10 at 22:22 +0530, Atin Mukherjee wrote:
[snip]
> strace output claims the command exited successfully. Are you sure ls
> got hung?

Not sure, but this one definitely hung. 'mkdir("test", 0777' was the last
output, and it's been stuck here for about 7 minutes now:

[root at voicemail1b-1 14391.broken]# strace mkdir test
execve("/usr/bin/mkdir", ["mkdir", "test"], [/* 27 vars */]) = 0
brk(0) = 0x8db000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a89000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=31874, ...}) = 0
mmap(NULL, 31874, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f3468a81000
close(3) = 0
open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=147120, ...}) = 0
mmap(NULL, 2246784, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468644000
mprotect(0x7f3468665000, 2097152, PROT_NONE) = 0
mmap(0x7f3468865000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x21000) = 0x7f3468865000
mmap(0x7f3468867000, 6272, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f3468867000
close(3) = 0
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\34\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2107760, ...}) = 0
mmap(NULL, 3932736, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468283000
mprotect(0x7f3468439000, 2097152, PROT_NONE) = 0
mmap(0x7f3468639000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b6000) = 0x7f3468639000
mmap(0x7f346863f000, 16960, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f346863f000
close(3) = 0
open("/lib64/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\360\25\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=398272, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a80000
mmap(NULL, 2490888, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468022000
mprotect(0x7f3468081000, 2097152, PROT_NONE) = 0
mmap(0x7f3468281000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x5f000) = 0x7f3468281000
close(3) = 0
open("/lib64/liblzma.so.5", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0000/\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=153184, ...}) = 0
mmap(NULL, 2245240, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3467dfd000
mprotect(0x7f3467e21000, 2093056, PROT_NONE) = 0
mmap(0x7f3468020000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x23000) = 0x7f3468020000
close(3) = 0
open("/lib64/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\16\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=19512, ...}) = 0
mmap(NULL, 2109744, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3467bf9000
mprotect(0x7f3467bfc000, 2093056, PROT_NONE) = 0
mmap(0x7f3467dfb000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7f3467dfb000
close(3) = 0
open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240l\0\0\0\0\0\0"..., 832) = 832
\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=141616, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7f000 mmap(NULL, 2208864, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f34679dd000 mprotect(0x7f34679f3000, 2097152, PROT_NONE) = 0 mmap(0x7f3467bf3000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED| MAP_DENYWRITE, 3, 0x16000) = 0x7f3467bf3000 mmap(0x7f3467bf5000, 13408, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED| MAP_ANONYMOUS, -1, 0) = 0x7f3467bf5000 close(3) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7e000 mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7c000 arch_prctl(ARCH_SET_FS, 0x7f3468a7c800) = 0 mprotect(0x7f3468639000, 16384, PROT_READ) = 0 mprotect(0x7f3467bf3000, 4096, PROT_READ) = 0 mprotect(0x7f3467dfb000, 4096, PROT_READ) = 0 mprotect(0x7f3468020000, 4096, PROT_READ) = 0 mprotect(0x7f3468281000, 4096, PROT_READ) = 0 mprotect(0x7f3468865000, 4096, PROT_READ) = 0 mprotect(0x611000, 4096, PROT_READ) = 0 mprotect(0x7f3468a8a000, 4096, PROT_READ) = 0 munmap(0x7f3468a81000, 31874) = 0 set_tid_address(0x7f3468a7cad0) = 24942 set_robust_list(0x7f3468a7cae0, 24) = 0 rt_sigaction(SIGRTMIN, {0x7f34679e3780, [], SA_RESTORER|SA_SIGINFO, 0x7f34679ec130}, NULL, 8) = 0 rt_sigaction(SIGRT_1, {0x7f34679e3810, [], SA_RESTORER|SA_RESTART| SA_SIGINFO, 0x7f34679ec130}, NULL, 8) = 0 rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0 getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0 statfs("/sys/fs/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0 statfs("/sys/fs/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0 stat("/sys/fs/selinux", {st_mode=S_IFDIR|0755, st_size=0, ...}) = 0 brk(0) = 0x8db000 brk(0x8fc000) = 0x8fc000 mkdir("test", 0777> > > > > > > > > >> > > >> > Then ... do I need to run something on one of the bricks while > strace is > >> > running? > >> > > >> > Cheers, > >> > Kingsley. 
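When a FUSE client blocks inside a single call like the mkdir above, the client's mount log and a statedump of the volume are the usual next step for working out which brick or lock is holding it up. A rough sketch, assuming default log and statedump locations (they may differ on this system) and a hypothetical mount point of /mnt/callrec:

  # on the client: the FUSE mount log is normally named after the mount
  # point, with slashes replaced by hyphens
  tail -f /var/log/glusterfs/mnt-callrec.log

  # on a server node: dump the in-memory state of the brick processes;
  # the dump files usually land under /var/run/gluster/ and list any
  # pending or blocked locks on the stuck directory
  gluster volume statedump callrec

  # then re-check what self-heal thinks is still outstanding
  gluster volume heal callrec info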