Duc Le Minh
2008-Oct-17 04:00 UTC
[Gluster-users] Unify very slow for 2000 query to cluster / s
I built a cluster with GlusterFS 1.3, with 3 brick nodes:
Xeon QuadCore 2.33GHz, 32GB RAM, 3Gb network (bonding), RAID 6 with 12x 1TB HDDs
+ 2 hot spares.
The cluster holds about 3 million files, 16TB in total.

Clients:
5 nodes, Xeon QuadCore 2.33GHz, 8GB RAM, 2Gb network (bonding), running lighttpd
with 600 concurrent connections streaming FLV.

Streaming runs fine at 5 Gbps, but if I run 'ls -l' in a directory on the
GlusterFS mount, the whole system becomes very slow.

Please help me find a solution.

This is the server config:
##############################################
### GlusterFS Server Volume Specification ##
##############################################
# Unify Volume 1
volume unify-raw1
type storage/posix
option directory /home/node1/unify/baamboo/data
end-volume
volume u-posix-locks1
type features/posix-locks
option mandatory on
subvolumes unify-raw1
end-volume
volume u-io-thr1
type performance/io-threads
option thread-count 4
option cache-size 128MB
subvolumes u-posix-locks1
end-volume
volume u-wb1
type performance/write-behind
option aggregate-size 1MB # default is 0bytes
option flush-behind on # default is 'off'
subvolumes u-io-thr1
end-volume
volume unify1
type performance/read-ahead
option page-size 512kB # 256KB is the default option
option page-count 64 # 2 is default option
option force-atime-update off # default is off
subvolumes u-wb1
end-volume
# Unify Volume 2
volume unify-raw2
type storage/posix
option directory /home/node2/unify/baamboo/data
end-volume
volume u-posix-locks2
type features/posix-locks
option mandatory on
subvolumes unify-raw2
end-volume
volume u-io-thr2
type performance/io-threads
option thread-count 4
option cache-size 128MB
subvolumes u-posix-locks2
end-volume
volume u-wb2
type performance/write-behind
option aggregate-size 1MB # default is 0bytes
option flush-behind on # default is 'off'
subvolumes u-io-thr2
end-volume
volume unify2
type performance/read-ahead
option page-size 512kB # 256KB is the default option
option page-count 64 # 2 is default option
option force-atime-update off # default is off
subvolumes u-wb2
end-volume
volume ns-raw
type storage/posix
option directory /home/node1/unify/baamboo/ns
end-volume
volume ns-io-thr
type performance/io-threads
option thread-count 4
option cache-size 32MB
subvolumes ns-raw
end-volume
volume ns
type performance/read-ahead
option page-size 256kB # 256KB is the default option
option page-count 16 # 2 is default option
option force-atime-update off # default is off
subvolumes ns-io-thr
end-volume
### Add network serving capability to above brick.
volume server
type protocol/server
option transport-type tcp/server # For TCP/IP transport
option bind-address 192.168.6.6 # Default is listen on all interfaces
option listen-port 60001 # Default is 6996
subvolumes unify1 unify2 ns
option auth.ip.unify1.allow 192.168.* # Allow access to "brick" volume
option auth.ip.unify2.allow 192.168.* # Allow access to "brick" volume
option auth.ip.ns.allow 192.168.* # Allow access to "brick" volume
end-volume
Client Config
### file: client-volume.spec.sample
##############################################
### GlusterFS Client Volume Specification ##
##############################################
### Add client feature and attach to remote subvolume
# volume client
# type protocol/client
# option transport-type tcp/client # for TCP/IP transport
# option ib-verbs-work-request-send-size 1048576
# option ib-verbs-work-request-send-count 16
# option ib-verbs-work-request-recv-size 1048576
# option ib-verbs-work-request-recv-count 16
# option transport-type ib-sdp/client # for Infiniband transport
# option transport-type ib-verbs/client # for ib-verbs transport
# option remote-host 127.0.0.1 # IP address of the remote brick
# option remote-port 6996 # default server port is 6996
# option transport-timeout 30 # seconds to wait for a reply
# from server for each request
# option remote-subvolume brick # name of the remote volume
# end-volume
# Volume for Node 1
volume unify_1
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.6
option remote-port 60001
option remote-subvolume unify1
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 2
volume unify_2
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.6
option remote-port 60001
option remote-subvolume unify2
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 3
volume unify_3
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.8
option remote-port 60001
option remote-subvolume unify1
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 4
volume unify_4
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.8
option remote-port 60001
option remote-subvolume unify2
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 5
volume unify_5
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.4
option remote-port 60001
option remote-subvolume unify1
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 6
volume unify_6
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.4
option remote-port 60001
option remote-subvolume unify2
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node NS
# volume unify_ns1
# type protocol/client
# option transport-type tcp/client
# option remote-host 192.168.6.6
# option remote-port 60001
# option remote-subvolume ns
# option transport-timeout 600 # seconds to wait for a reply
# end-volume
# Volume for Node NS
volume unify_ns_raw # unify_ns2
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.8
option remote-port 60001
option remote-subvolume ns
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node NS
# volume unify_ns3
# type protocol/client
# option transport-type tcp/client
# option remote-host 192.168.6.4
# option remote-port 60001
# option remote-subvolume ns
# option transport-timeout 600 # seconds to wait for a reply
# end-volume
# Volume AFR node 1-2-3
# volume unify_ns_raw
# type cluster/afr
# subvolumes unify_ns1 unify_ns2 unify_ns3
# end-volume
volume ns_iot
type performance/io-threads
option thread-count 4
option cache-size 256MB
subvolumes unify_ns_raw
end-volume
# Add readahead feature
volume ns_readahead
type performance/read-ahead
option page-size 128kB # unit in bytes
option page-count 16 # cache per file = (page-count x page-size)
subvolumes ns_iot
end-volume
volume unify_ns # ns_ioc
type performance/io-cache
option cache-size 128MB # default is 32MB
option page-size 128KB #128KB is default option
# option priority *.h:3,*.html:2,*:1 # default is '*:0'
option force-revalidate-timeout 2 # default is 1
subvolumes ns_readahead
end-volume
volume unify
type cluster/unify
subvolumes unify_1 unify_2 unify_3 unify_4 unify_5 unify_6
option namespace unify_ns
option scheduler rr
option rr.limits.min-free-disk 25%
option rr.refresh-interval 10
end-volume
volume iot
type performance/io-threads
option thread-count 4
option cache-size 256MB
subvolumes unify
end-volume
volume wb
type performance/write-behind
option aggregate-size 1MB # default is 0bytes
option flush-behind on # default is 'off'
subvolumes iot
end-volume
volume readahead
type performance/read-ahead
option page-size 512kB # unit in bytes
option page-count 64 # cache per file = (page-count x page-size)
subvolumes wb
end-volume
volume ioc
type performance/io-cache
option cache-size 1024MB # default is 32MB
option page-size 1MB #128KB is default option
# option priority *.h:3,*.html:2,*:1 # default is '*:0'
option force-revalidate-timeout 2 # default is 1
subvolumes readahead
end-volume
Thanks!
--
Le Minh Duc
Email: duclm.vn at gmail.com
Tom Lahti
2008-Nov-06 22:11 UTC
[Gluster-users] {Disarmed} Unify very slow for 2000 query to cluster / s
Streaming fast but directory searches slow = classic filesystem bottleneck.
With 3 million files? What's the filesystem underneath your glusterfs? It
isn't JFS, is it? It is well known that JFS performance degrades linearly
with the number of files in the filesystem. XFS isn't particularly fast
either (for directory searches). You could try to mount with noatime, but
this will only give a marginal improvement.

For 3 million files, you probably want ext3 with the dir_index option
instead. I have 20 million+ files on ext3 with dir_index and it's rocket
fast to locate any file, even when not cached. "ls -l" in any random
directory is practically instant.

duclm.vn at gmail.com wrote:
> Streaming runs fine at 5 Gbps, but if I run 'ls -l' in a directory on the
> GlusterFS mount, the whole system becomes very slow.

-- 
===========================
  Tom Lahti
  BIT Statement LLC
  (425)251-0833 x 117
  http://www.bitstatement.net/
============================
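As a concrete sketch of that suggestion -- assuming the bricks sit on ext3,
and using /dev/sdXX and /home/node1 purely as placeholders for the real brick
device and mount point -- dir_index can be checked, enabled and put to use
roughly like this:

    # Check whether the brick filesystem already has dir_index enabled
    tune2fs -l /dev/sdXX | grep -i 'dir_index'

    # Enable the feature if it is missing (existing directories are not
    # indexed until the indexes are rebuilt below)
    tune2fs -O dir_index /dev/sdXX

    # Rebuild/optimise the directory indexes; the filesystem must be unmounted
    umount /home/node1
    e2fsck -fD /dev/sdXX

    # Remount with noatime so stat()-heavy workloads do not trigger atime writes
    mount -o noatime /dev/sdXX /home/node1

The same check can be run against an existing brick with dumpe2fs -h, as in
the next message.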
Tom Lahti
2008-Nov-06 22:47 UTC
[Gluster-users] {Disarmed} Unify very slow for 2000 query to cluster / s
root at somebox:/mnt/cluster/nested/really/deep/here# time ls -l | wc -l
6656

real    0m3.856s
user    0m0.048s
sys     0m0.092s

root at somebox:~# dumpe2fs -h /dev/vg01/cluster
dumpe2fs 1.40.8 (13-Mar-2008)
Filesystem revision #:    1 (dynamic)
Filesystem features:      has_journal ext_attr resize_inode dir_index filetype needs_recovery sparse_super large_file
Filesystem OS type:       Linux
Inode count:              121372672
Block count:              485490688
Reserved block count:     24274534
Free blocks:              260715390
Free inodes:              114407582
First block:              0
Block size:               4096
Fragment size:            4096
Reserved GDT blocks:      908
Blocks per group:         32768
Fragments per group:      32768
Inodes per group:         8192
Inode blocks per group:   512
RAID stride:              128
RAID stripe width:        256
First inode:              11
Inode size:               256
Journal inode:            8
Default directory hash:   tea
Journal backup:           inode blocks
Journal size:             128M

root at somebox:~# mount | egrep "export|gluster"
/dev/mapper/vg01-cluster on /usr/local/export type ext3 (rw,noatime,reservation)
glusterfs on /mnt/cluster type fuse (rw,nosuid,nodev,allow_other,default_permissions,max_read=1048576)

Tom Lahti wrote:
> I have 20 million+ files on ext3 with dir_index and it's rocket fast to
> locate any file, even when not cached. "ls -l" in any random directory is
> practically instant.

OK, it's only 12 million files. Sue me :P

By the way, I am re-exporting this with samba and beating the Windows 2003
Servers for performance, both write and read (read in particular) ;)

-- 
===========================
  Tom Lahti
  BIT Statement LLC
  (425)251-0833 x 117
  http://www.bitstatement.net/
============================
Andrew McGill
2008-Nov-07 08:05 UTC
[Gluster-users] Unify very slow for 2000 query to cluster / s
On Friday 17 October 2008 06:00:08 Duc Le Minh wrote:
> I built a cluster with GlusterFS 1.3, with 3 brick nodes:
> Xeon QuadCore 2.33GHz, 32GB RAM, 3Gb network (bonding), RAID 6 with 12x 1TB HDDs
> + 2 hot spares.
> The cluster holds about 3 million files, 16TB in total.
>
> Clients:
> 5 nodes, Xeon QuadCore 2.33GHz, 8GB RAM, 2Gb network (bonding), running lighttpd
> with 600 concurrent connections streaming FLV.
>
> Streaming runs fine at 5 Gbps, but if I run 'ls -l' in a directory on the
> GlusterFS mount, the whole system becomes very slow.
>
> Please help me find a solution.

If you run ls -l, that does a stat() of each file in the directory. By
comparison, echo * is lightning fast, since it only has to do a readdir()
from the main node. You won't see a delay on a low-latency network, but in
the non-developer case, it is sloooooooooooooooow.

A solution (short of reducing network latency) would be threaded versions of
filesystem tools which work with multiple files, so that they can copy, move
and stat files in parallel and benefit from filesystem parallelism --

    export GNU_COREUTILS_THREADS=8
    cp    # simultaneous read() and write() sessions
    mv    # simultaneous link and unlink calls
    ls    # parallel stat()

One could also optimise the text utilities like cat by doing the open() and
stat() operations in parallel and in the background -- userspace read-ahead
caching. All of the utilities which process multiple filenames could get
better speed from this -- rm, cat, chown, chmod ... even tail, head, wc.

I suspect that one could produce better performance even on a local
filesystem with some well placed threading in the utilities.
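Those threaded coreutils don't exist yet (GNU_COREUTILS_THREADS above is a
proposal, not a real variable), but a rough approximation is already possible
with xargs. A minimal sketch, run inside a directory on the GlusterFS mount,
with the batch size and the -P 8 parallelism chosen arbitrarily:

    # Separate the readdir() cost from the per-file stat() cost
    time echo * > /dev/null        # readdir() only
    time ls -l > /dev/null         # readdir() plus one stat() per entry

    # Issue the stat() calls in parallel (8 workers, 64 names per batch) to
    # overlap the per-file network round trips; output is discarded here
    find . -maxdepth 1 -print0 | xargs -0 -n 64 -P 8 stat > /dev/null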