Atin Mukherjee
2015-Aug-10 17:23 UTC
[Gluster-users] volume not working after yum update - gluster 3.6.3
-Atin
Sent from one plus one

On Aug 10, 2015 10:34 PM, "Kingsley" <gluster at gluster.dogwind.com> wrote:
>
> On Mon, 2015-08-10 at 22:22 +0530, Atin Mukherjee wrote:
> [snip]
>
>> strace output claims the command exited successfully. Are you sure ls
>> got hung?
>
> Not sure, but this one definitely hung. 'mkdir("test", 0777' was the last
> output, and it's been stuck here for about 7 minutes now:
>
> [root at voicemail1b-1 14391.broken]# strace mkdir test
> execve("/usr/bin/mkdir", ["mkdir", "test"], [/* 27 vars */]) = 0
> brk(0) = 0x8db000
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a89000
> access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
> open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0644, st_size=31874, ...}) = 0
> mmap(NULL, 31874, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f3468a81000
> close(3) = 0
> open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=147120, ...}) = 0
> mmap(NULL, 2246784, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468644000
> mprotect(0x7f3468665000, 2097152, PROT_NONE) = 0
> mmap(0x7f3468865000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x21000) = 0x7f3468865000
> mmap(0x7f3468867000, 6272, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f3468867000
> close(3) = 0
> open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\34\2\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=2107760, ...}) = 0
> mmap(NULL, 3932736, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468283000
> mprotect(0x7f3468439000, 2097152, PROT_NONE) = 0
> mmap(0x7f3468639000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b6000) = 0x7f3468639000
> mmap(0x7f346863f000, 16960, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f346863f000
> close(3) = 0
> open("/lib64/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\360\25\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=398272, ...}) = 0
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a80000
> mmap(NULL, 2490888, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468022000
> mprotect(0x7f3468081000, 2097152, PROT_NONE) = 0
> mmap(0x7f3468281000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x5f000) = 0x7f3468281000
> close(3) = 0
> open("/lib64/liblzma.so.5", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0000/\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=153184, ...}) = 0
> mmap(NULL, 2245240, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3467dfd000
> mprotect(0x7f3467e21000, 2093056, PROT_NONE) = 0
> mmap(0x7f3468020000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x23000) = 0x7f3468020000
> close(3) = 0
> open("/lib64/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\16\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=19512, ...}) = 0
> mmap(NULL, 2109744, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3467bf9000
> mprotect(0x7f3467bfc000, 2093056, PROT_NONE) = 0
> mmap(0x7f3467dfb000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7f3467dfb000
> close(3) = 0
> open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
> read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240l\0\0\0\0\0\0"..., 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=141616, ...}) = 0
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7f000
> mmap(NULL, 2208864, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f34679dd000
> mprotect(0x7f34679f3000, 2097152, PROT_NONE) = 0
> mmap(0x7f3467bf3000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x16000) = 0x7f3467bf3000
> mmap(0x7f3467bf5000, 13408, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f3467bf5000
> close(3) = 0
> mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7e000
> mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7c000
> arch_prctl(ARCH_SET_FS, 0x7f3468a7c800) = 0
> mprotect(0x7f3468639000, 16384, PROT_READ) = 0
> mprotect(0x7f3467bf3000, 4096, PROT_READ) = 0
> mprotect(0x7f3467dfb000, 4096, PROT_READ) = 0
> mprotect(0x7f3468020000, 4096, PROT_READ) = 0
> mprotect(0x7f3468281000, 4096, PROT_READ) = 0
> mprotect(0x7f3468865000, 4096, PROT_READ) = 0
> mprotect(0x611000, 4096, PROT_READ) = 0
> mprotect(0x7f3468a8a000, 4096, PROT_READ) = 0
> munmap(0x7f3468a81000, 31874) = 0
> set_tid_address(0x7f3468a7cad0) = 24942
> set_robust_list(0x7f3468a7cae0, 24) = 0
> rt_sigaction(SIGRTMIN, {0x7f34679e3780, [], SA_RESTORER|SA_SIGINFO, 0x7f34679ec130}, NULL, 8) = 0
> rt_sigaction(SIGRT_1, {0x7f34679e3810, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x7f34679ec130}, NULL, 8) = 0
> rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
> getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
> statfs("/sys/fs/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0
> statfs("/sys/fs/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0
> stat("/sys/fs/selinux", {st_mode=S_IFDIR|0755, st_size=0, ...}) = 0
> brk(0) = 0x8db000
> brk(0x8fc000) = 0x8fc000
> mkdir("test", 0777

Can you also collect the statedump of all the brick processes when the
command is hung?

+ Ravi, could you check this?

>> >> > Then ... do I need to run something on one of the bricks while
>> >> > strace is running?
>> >> >
>> >> > Cheers,
>> >> > Kingsley.
>> [snip]
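One possible way to gather the statedumps Atin asks for above is sketched below. This is not from the thread itself: it assumes a stock RPM install where dumps are written under /var/run/gluster on each brick server (the directory is governed by the server.statedump-path volume option), and it reuses the volume name callrec and the brick PIDs that appear later in the thread.

# From any node in the pool, ask every brick of the volume to dump its state:
gluster volume statedump callrec

# If the management CLI is itself unresponsive, a brick process can be told to
# dump state directly with SIGUSR1, using the Pid column from
# "gluster volume status callrec" (for example 29041 on gluster1a-1):
kill -USR1 29041

# Then, on each brick server, collect the dump files (default location assumed):
tar czf /root/statedump-$(hostname).tar.gz /var/run/gluster/*.dump.*

The SIGUSR1 route is worth keeping in mind when a hang stops the CLI from completing, since the signal is handled by the brick process itself.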
Kingsley
2015-Aug-10 18:28 UTC
[Gluster-users] volume not working after yum update - gluster 3.6.3
On Mon, 2015-08-10 at 22:53 +0530, Atin Mukherjee wrote:
[snip]
> > stat("/sys/fs/selinux", {st_mode=S_IFDIR|0755, st_size=0, ...}) = 0
> > brk(0) = 0x8db000
> > brk(0x8fc000) = 0x8fc000
> > mkdir("test", 0777
>
> Can you also collect the statedump of all the brick processes when the
> command is hung?
>
> + Ravi, could you check this?

I ran the command but I could not find where it put the output:

[root at gluster1a-1 ~]# gluster volume statedump callrec all
volume statedump: success
[root at gluster1a-1 ~]# gluster volume info callrec

Volume Name: callrec
Type: Replicate
Volume ID: a39830b7-eddb-4061-b381-39411274131a
Status: Started
Number of Bricks: 1 x 4 = 4
Transport-type: tcp
Bricks:
Brick1: gluster1a-1:/data/brick/callrec
Brick2: gluster1b-1:/data/brick/callrec
Brick3: gluster2a-1:/data/brick/callrec
Brick4: gluster2b-1:/data/brick/callrec
Options Reconfigured:
performance.flush-behind: off
[root at gluster1a-1 ~]# gluster volume status callrec
Status of volume: callrec
Gluster process                                 Port    Online  Pid
------------------------------------------------------------------------------
Brick gluster1a-1:/data/brick/callrec           49153   Y       29041
Brick gluster1b-1:/data/brick/callrec           49153   Y       31260
Brick gluster2a-1:/data/brick/callrec           49153   Y       31585
Brick gluster2b-1:/data/brick/callrec           49153   Y       12153
NFS Server on localhost                         2049    Y       29733
Self-heal Daemon on localhost                   N/A     Y       29741
NFS Server on gluster1b-1                       2049    Y       31872
Self-heal Daemon on gluster1b-1                 N/A     Y       31882
NFS Server on gluster2a-1                       2049    Y       32216
Self-heal Daemon on gluster2a-1                 N/A     Y       32226
NFS Server on gluster2b-1                       2049    Y       12752
Self-heal Daemon on gluster2b-1                 N/A     Y       12762

Task Status of Volume callrec
------------------------------------------------------------------------------
There are no active volume tasks

[root at gluster1a-1 ~]# ls -l /tmp
total 144
drwx------. 3 root root    16 Aug  8 22:20 systemd-private-Dp10Pz
-rw-------. 1 root root  5818 Jul 31 06:39 yum_save_tx.2015-07-31.06-39.JCvHd5.yumtx
-rw-------. 1 root root  5818 Aug  1 06:58 yum_save_tx.2015-08-01.06-58.wBytr2.yumtx
-rw-------. 1 root root  5818 Aug  2 05:18 yum_save_tx.2015-08-02.05-18.AXIFSe.yumtx
-rw-------. 1 root root  5818 Aug  3 07:15 yum_save_tx.2015-08-03.07-15.EDd8rg.yumtx
-rw-------. 1 root root  5818 Aug  4 03:48 yum_save_tx.2015-08-04.03-48.XE513B.yumtx
-rw-------. 1 root root  5818 Aug  5 09:03 yum_save_tx.2015-08-05.09-03.mX8xXF.yumtx
-rw-------. 1 root root 28869 Aug  6 06:39 yum_save_tx.2015-08-06.06-39.166wJX.yumtx
-rw-------. 1 root root 28869 Aug  7 07:20 yum_save_tx.2015-08-07.07-20.rLqJnT.yumtx
-rw-------. 1 root root 28869 Aug  8 08:29 yum_save_tx.2015-08-08.08-29.KKaite.yumtx
[root at gluster1a-1 ~]#

Where should I find the output of the statedump command?

Cheers,
Kingsley.
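A hedged pointer on the question just above (not part of the original exchange): on a packaged GlusterFS 3.6.x install the statedump files are written on each brick server rather than on the node where the CLI was run, and the default directory is normally /var/run/gluster, not /tmp; it can be overridden per volume with the server.statedump-path option. A quick check, assuming those defaults:

# Run on each brick server; file names follow roughly
#   <brick-path-with-slashes-as-dashes>.<pid>.dump.<timestamp>
ls -lt /var/run/gluster/ | head

# If a different directory was configured, it would have been set with:
#   gluster volume set callrec server.statedump-path <directory>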
> >> >> > Then ... do I need to run something on one of the bricks while
> >> >> > strace is running?
> >> >> >
> >> >> > Cheers,
> >> >> > Kingsley.
> >> >> >
> >> >> > > > [root at gluster1b-1 ~]# gluster volume heal callrec info
> >> >> > > > Brick gluster1a-1.dns99.co.uk:/data/brick/callrec/
> >> >> > > > <gfid:164f888f-2049-49e6-ad26-c758ee091863>
> >> >> > > > /recordings/834723/14391 - Possibly undergoing heal
> >> >> > > >
> >> >> > > > <gfid:e280b40c-d8b7-43c5-9da7-4737054d7a7f>
> >> >> > > > <gfid:b1fbda4a-732f-4f5d-b5a1-8355d786073e>
> >> >> > > > <gfid:edb74524-b4b7-4190-85e7-4aad002f6e7c>
> >> >> > > > <gfid:9b8b8446-1e27-4113-93c2-6727b1f457eb>
> >> >> > > > <gfid:650efeca-b45c-413b-acc3-f0a5853ccebd>
> >> >> > > > Number of entries: 7
> >> >> > > >
> >> >> > > > Brick gluster1b-1.dns99.co.uk:/data/brick/callrec/
> >> >> > > > Number of entries: 0
> >> >> > > >
> >> >> > > > Brick gluster2a-1.dns99.co.uk:/data/brick/callrec/
> >> >> > > > <gfid:e280b40c-d8b7-43c5-9da7-4737054d7a7f>
> >> >> > > > <gfid:164f888f-2049-49e6-ad26-c758ee091863>
> >> >> > > > <gfid:650efeca-b45c-413b-acc3-f0a5853ccebd>
> >> >> > > > <gfid:b1fbda4a-732f-4f5d-b5a1-8355d786073e>
> >> >> > > > /recordings/834723/14391 - Possibly undergoing heal
> >> >> > > >
> >> >> > > > <gfid:edb74524-b4b7-4190-85e7-4aad002f6e7c>
> >> >> > > > <gfid:9b8b8446-1e27-4113-93c2-6727b1f457eb>
> >> >> > > > Number of entries: 7
> >> >> > > >
> >> >> > > > Brick gluster2b-1.dns99.co.uk:/data/brick/callrec/
> >> >> > > > Number of entries: 0
> >> >> > > >
> >> >> > > > If I query each brick directly for the number of files/directories
> >> >> > > > within that, I get 1731 on gluster1a-1 and gluster2a-1, but 1737 on
> >> >> > > > the other two, using this command:
> >> >> > > >
> >> >> > > > # find /data/brick/callrec/recordings/834723/14391 -print | wc -l
> >> >> > > >
> >> >> > > > Cheers,
> >> >> > > > Kingsley.
> >> >> > > >
> >> >> > > > On Mon, 2015-08-10 at 11:05 +0100, Kingsley wrote:
> >> >> > > > > Sorry for the blind panic - restarting the volume seems to have
> >> >> > > > > fixed it.
> >> >> > > > >
> >> >> > > > > But then my next question - why is this necessary? Surely it
> >> >> > > > > undermines the whole point of a high availability system?
> >> >> > > > >
> >> >> > > > > Cheers,
> >> >> > > > > Kingsley.
> >> >> > > > >
> >> >> > > > > On Mon, 2015-08-10 at 10:53 +0100, Kingsley wrote:
> >> >> > > > > > Hi,
> >> >> > > > > >
> >> >> > > > > > We have a 4 way replicated volume using gluster 3.6.3 on CentOS 7.
> >> >> > > > > >
> >> >> > > > > > Over the weekend I did a yum update on each of the bricks in turn,
> >> >> > > > > > but now when clients (using fuse mounts) try to access the volume,
> >> >> > > > > > it hangs. Gluster itself wasn't updated (we've disabled that repo
> >> >> > > > > > so that we keep to 3.6.3 for now).
> >> >> > > > > >
> >> >> > > > > > This was what I did:
> >> >> > > > > >
> >> >> > > > > >     * on first brick, "yum update"
> >> >> > > > > >     * reboot brick
> >> >> > > > > >     * watch "gluster volume status" on another brick and wait
> >> >> > > > > >       for it to say all 4 bricks are online before proceeding to
> >> >> > > > > >       update the next brick
> >> >> > > > > >
> >> >> > > > > > I was expecting the clients might pause 30 seconds while they
> >> >> > > > > > notice a brick is offline, but then recover.
> >> >> > > > > >
> >> >> > > > > > I've tried re-mounting clients, but that hasn't helped.
> >> >> > > > > >
> >> >> > > > > > I can't see much data in any of the log files.
> >> >> > > > > >
> >> >> > > > > > I've tried "gluster volume heal callrec" but it doesn't seem to
> >> >> > > > > > have helped.
> >> >> > > > > >
> >> >> > > > > > What shall I do next?
> >> >> > > > > >
> >> >> > > > > > I've pasted some stuff below in case any of it helps.
> >> >> > > > > >
> >> >> > > > > > Cheers,
> >> >> > > > > > Kingsley.
> >> >> > > > > >
> >> >> > > > > > [root at gluster1b-1 ~]# gluster volume info callrec
> >> >> > > > > >
> >> >> > > > > > Volume Name: callrec
> >> >> > > > > > Type: Replicate
> >> >> > > > > > Volume ID: a39830b7-eddb-4061-b381-39411274131a
> >> >> > > > > > Status: Started
> >> >> > > > > > Number of Bricks: 1 x 4 = 4
> >> >> > > > > > Transport-type: tcp
> >> >> > > > > > Bricks:
> >> >> > > > > > Brick1: gluster1a-1:/data/brick/callrec
> >> >> > > > > > Brick2: gluster1b-1:/data/brick/callrec
> >> >> > > > > > Brick3: gluster2a-1:/data/brick/callrec
> >> >> > > > > > Brick4: gluster2b-1:/data/brick/callrec
> >> >> > > > > > Options Reconfigured:
> >> >> > > > > > performance.flush-behind: off
> >> >> > > > > > [root at gluster1b-1 ~]#
> >> >> > > > > >
> >> >> > > > > > [root at gluster1b-1 ~]# gluster volume status callrec
> >> >> > > > > > Status of volume: callrec
> >> >> > > > > > Gluster process                          Port   Online  Pid
> >> >> > > > > > ------------------------------------------------------------------------------
> >> >> > > > > > Brick gluster1a-1:/data/brick/callrec    49153  Y       6803
> >> >> > > > > > Brick gluster1b-1:/data/brick/callrec    49153  Y       2614
> >> >> > > > > > Brick gluster2a-1:/data/brick/callrec    49153  Y       2645
> >> >> > > > > > Brick gluster2b-1:/data/brick/callrec    49153  Y       4325
> >> >> > > > > > NFS Server on localhost                  2049   Y       2769
> >> >> > > > > > Self-heal Daemon on localhost            N/A    Y       2789
> >> >> > > > > > NFS Server on gluster2a-1                2049   Y       2857
> >> >> > > > > > Self-heal Daemon on gluster2a-1          N/A    Y       2814
> >> >> > > > > > NFS Server on 88.151.41.100              2049   Y       6833
> >> >> > > > > > Self-heal Daemon on 88.151.41.100        N/A    Y       6824
> >> >> > > > > > NFS Server on gluster2b-1                2049   Y       4428
> >> >> > > > > > Self-heal Daemon on gluster2b-1          N/A    Y       4387
> >> >> > > > > >
> >> >> > > > > > Task Status of Volume callrec
> >> >> > > > > > ------------------------------------------------------------------------------
> >> >> > > > > > There are no active volume tasks
> >> >> > > > > >
> >> >> > > > > > [root at gluster1b-1 ~]#
> >> >> > > > > >
> >> >> > > > > > [root at gluster1b-1 ~]# gluster volume heal callrec info
> >> >> > > > > > Brick gluster1a-1.dns99.co.uk:/data/brick/callrec/
> >> >> > > > > > /to_process - Possibly undergoing heal
> >> >> > > > > >
> >> >> > > > > > Number of entries: 1
> >> >> > > > > >
> >> >> > > > > > Brick gluster1b-1.dns99.co.uk:/data/brick/callrec/
> >> >> > > > > > Number of entries: 0
> >> >> > > > > >
> >> >> > > > > > Brick gluster2a-1.dns99.co.uk:/data/brick/callrec/
> >> >> > > > > > /to_process - Possibly undergoing heal
> >> >> > > > > >
> >> >> > > > > > Number of entries: 1
> >> >> > > > > >
> >> >> > > > > > Brick gluster2b-1.dns99.co.uk:/data/brick/callrec/
> >> >> > > > > > Number of entries: 0
> >> >> > > > > >
> >> >> > > > > > [root at gluster1b-1 ~]#
> [snip]
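The quoted procedure above waits only for all four bricks to show Online in "gluster volume status" before rebooting the next server. A hedged sketch of an extra guard that the heal info output in this thread suggests, not an official upgrade procedure: also wait until every brick reports zero pending heal entries before moving on. The grep pattern simply matches the "Number of entries:" lines shown above; the 30-second sleep is arbitrary.

# After an updated brick is back online, wait for self-heal to drain
# before updating the next server:
while gluster volume heal callrec info | grep -E '^Number of entries: [1-9]' >/dev/null; do
    echo "self-heal still has pending entries; waiting..."
    sleep 30
done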