Duc Le Minh
2008-Oct-17 04:00 UTC
[Gluster-users] Unify very slow for 2000 query to cluster / s
I built a cluster with GlusterFS 1.3, with 3 brick nodes:
Xeon QuadCore 2.33GHz, 32GB RAM, 3Gb network (bonding), RAID 6 with 12x 1TB HDDs
+ 2 hot spares.
The cluster holds about 3 million files, 16TB in total.

Clients:
5 nodes, Xeon QuadCore 2.33GHz, 8GB RAM, 2Gb network (bonding), running lighttpd
with 600 concurrent connections streaming FLV.

Streaming runs fine at 5 Gbps, but if I run 'ls -l' in a directory on the
GlusterFS mount, the whole system becomes very slow.

Please help me find a solution.

This is the server config:
##############################################
### GlusterFS Server Volume Specification ##
##############################################
# Unify Volume 1
volume unify-raw1
type storage/posix
option directory /home/node1/unify/baamboo/data
end-volume
volume u-posix-locks1
type features/posix-locks
option mandatory on
subvolumes unify-raw1
end-volume
volume u-io-thr1
type performance/io-threads
option thread-count 4
option cache-size 128MB
subvolumes u-posix-locks1
end-volume
volume u-wb1
type performance/write-behind
option aggregate-size 1MB # default is 0bytes
option flush-behind on # default is 'off'
subvolumes u-io-thr1
end-volume
volume unify1
type performance/read-ahead
option page-size 512kB # 256KB is the default option
option page-count 64 # 2 is default option
option force-atime-update off # default is off
subvolumes u-wb1
end-volume
# Unify Volume 2
volume unify-raw2
type storage/posix
option directory /home/node2/unify/baamboo/data
end-volume
volume u-posix-locks2
type features/posix-locks
option mandatory on
subvolumes unify-raw2
end-volume
volume u-io-thr2
type performance/io-threads
option thread-count 4
option cache-size 128MB
subvolumes u-posix-locks2
end-volume
volume u-wb2
type performance/write-behind
option aggregate-size 1MB # default is 0bytes
option flush-behind on # default is 'off'
subvolumes u-io-thr2
end-volume
volume unify2
type performance/read-ahead
option page-size 512kB # 256KB is the default option
option page-count 64 # 2 is default option
option force-atime-update off # default is off
subvolumes u-wb2
end-volume
volume ns-raw
type storage/posix
option directory /home/node1/unify/baamboo/ns
end-volume
volume ns-io-thr
type performance/io-threads
option thread-count 4
option cache-size 32MB
subvolumes ns-raw
end-volume
volume ns
type performance/read-ahead
option page-size 256kB # 256KB is the default option
option page-count 16 # 2 is default option
option force-atime-update off # default is off
subvolumes ns-io-thr
end-volume
### Add network serving capability to above brick.
volume server
type protocol/server
option transport-type tcp/server # For TCP/IP transport
option bind-address 192.168.6.6 # Default is listen on all interfaces
option listen-port 60001 # Default is 6996
subvolumes unify1 unify2 ns
option auth.ip.unify1.allow 192.168.* # Allow access to "brick" volume
option auth.ip.unify2.allow 192.168.* # Allow access to "brick" volume
option auth.ip.ns.allow 192.168.* # Allow access to "brick" volume
end-volume
Client Config
### file: client-volume.spec.sample
##############################################
### GlusterFS Client Volume Specification ##
##############################################
### Add client feature and attach to remote subvolume
# volume client
# type protocol/client
# option transport-type tcp/client # for TCP/IP transport
# option ib-verbs-work-request-send-size 1048576
# option ib-verbs-work-request-send-count 16
# option ib-verbs-work-request-recv-size 1048576
# option ib-verbs-work-request-recv-count 16
# option transport-type ib-sdp/client # for Infiniband transport
# option transport-type ib-verbs/client # for ib-verbs transport
# option remote-host 127.0.0.1 # IP address of the remote brick
# option remote-port 6996 # default server port is 6996
# option transport-timeout 30 # seconds to wait for a reply
# from server for each request
# option remote-subvolume brick # name of the remote volume
# end-volume
# Volume for Node 1
volume unify_1
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.6
option remote-port 60001
option remote-subvolume unify1
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 2
volume unify_2
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.6
option remote-port 60001
option remote-subvolume unify2
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 3
volume unify_3
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.8
option remote-port 60001
option remote-subvolume unify1
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 4
volume unify_4
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.8
option remote-port 60001
option remote-subvolume unify2
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 5
volume unify_5
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.4
option remote-port 60001
option remote-subvolume unify1
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node 6
volume unify_6
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.4
option remote-port 60001
option remote-subvolume unify2
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node NS
# volume unify_ns1
# type protocol/client
# option transport-type tcp/client
# option remote-host 192.168.6.6
# option remote-port 60001
# option remote-subvolume ns
# option transport-timeout 600 # seconds to wait for a reply
# end-volume
# Volume for Node NS
volume unify_ns_raw # unify_ns2
type protocol/client
option transport-type tcp/client
option remote-host 192.168.6.8
option remote-port 60001
option remote-subvolume ns
option transport-timeout 600 # seconds to wait for a reply
end-volume
# Volume for Node NS
# volume unify_ns3
# type protocol/client
# option transport-type tcp/client
# option remote-host 192.168.6.4
# option remote-port 60001
# option remote-subvolume ns
# option transport-timeout 600 # seconds to wait for a reply
# end-volume
# Volume AFR node 1-2-3
# volume unify_ns_raw
# type cluster/afr
# subvolumes unify_ns1 unify_ns2 unify_ns3
# end-volume
volume ns_iot
type performance/io-threads
option thread-count 4
option cache-size 256MB
subvolumes unify_ns_raw
end-volume
# Add readahead feature
volume ns_readahead
type performance/read-ahead
option page-size 128kB # unit in bytes
option page-count 16 # cache per file = (page-count x page-size)
subvolumes ns_iot
end-volume
volume unify_ns # ns_ioc
type performance/io-cache
option cache-size 128MB # default is 32MB
option page-size 128KB #128KB is default option
# option priority *.h:3,*.html:2,*:1 # default is '*:0'
option force-revalidate-timeout 2 # default is 1
subvolumes ns_readahead
end-volume
volume unify
type cluster/unify
subvolumes unify_1 unify_2 unify_3 unify_4 unify_5 unify_6
option namespace unify_ns
option scheduler rr
option rr.limits.min-free-disk 25%
option rr.refresh-interval 10
end-volume
volume iot
type performance/io-threads
option thread-count 4
option cache-size 256MB
subvolumes unify
end-volume
volume wb
type performance/write-behind
option aggregate-size 1MB # default is 0bytes
option flush-behind on # default is 'off'
subvolumes iot
end-volume
volume readahead
type performance/read-ahead
option page-size 512kB # unit in bytes
option page-count 64 # cache per file = (page-count x page-size)
subvolumes wb
end-volume
volume ioc
type performance/io-cache
option cache-size 1024MB # default is 32MB
option page-size 1MB #128KB is default option
# option priority *.h:3,*.html:2,*:1 # default is '*:0'
option force-revalidate-timeout 2 # default is 1
subvolumes readahead
end-volume
Thanks!
--
Le Minh Duc
Email: duclm.vn at gmail.com
Tom Lahti
2008-Nov-06 22:11 UTC
[Gluster-users] {Disarmed} Unify very slow for 2000 query to cluster / s
Streaming fast but directory searches slow = classic filesystem bottleneck.
With 3 million files? What's the filesystem underneath your glusterfs? It
isn't JFS, is it? It is well known that JFS performance degrades linearly
with the number of files in the filesystem. XFS isn't particularly fast
either (for directory searches). You could try to mount with noatime, but
this will only give a marginal improvement.

For 3 million files, you probably want ext3 with the dir_index option
instead. I have 20 million+ files on ext3 with dir_index and it's rocket
fast to locate any file, even when not cached. "ls -l" in any random
directory is practically instant.

duclm.vn at gmail.com wrote:
> Streaming runs fine at 5 Gbps, but if I run 'ls -l' in a directory on the
> GlusterFS mount, the whole system becomes very slow.

-- 
===========================
  Tom Lahti
  BIT Statement LLC
  (425)251-0833 x 117
  http://www.bitstatement.net/
============================
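As a concrete sketch of that suggestion -- assuming the bricks sit on ext3,
and using /dev/sdXX and /home/node1 purely as placeholders for the real brick
device and mount point -- dir_index can be checked, enabled and put to use
roughly like this:

    # Check whether the brick filesystem already has dir_index enabled
    tune2fs -l /dev/sdXX | grep -i 'dir_index'

    # Enable the feature if it is missing (existing directories are not
    # indexed until the indexes are rebuilt below)
    tune2fs -O dir_index /dev/sdXX

    # Rebuild/optimise the directory indexes; the filesystem must be unmounted
    umount /home/node1
    e2fsck -fD /dev/sdXX

    # Remount with noatime so stat()-heavy workloads do not trigger atime writes
    mount -o noatime /dev/sdXX /home/node1

The same check can be run against an existing brick with dumpe2fs -h, as in
the next message.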
Tom Lahti
2008-Nov-06 22:47 UTC
[Gluster-users] {Disarmed} Unify very slow for 2000 query to cluster / s
root at somebox:/mnt/cluster/nested/really/deep/here# time ls -l | wc -l
6656

real    0m3.856s
user    0m0.048s
sys     0m0.092s

root at somebox:~# dumpe2fs -h /dev/vg01/cluster
dumpe2fs 1.40.8 (13-Mar-2008)
Filesystem revision #:    1 (dynamic)
Filesystem features:      has_journal ext_attr resize_inode dir_index filetype needs_recovery sparse_super large_file
Filesystem OS type:       Linux
Inode count:              121372672
Block count:              485490688
Reserved block count:     24274534
Free blocks:              260715390
Free inodes:              114407582
First block:              0
Block size:               4096
Fragment size:            4096
Reserved GDT blocks:      908
Blocks per group:         32768
Fragments per group:      32768
Inodes per group:         8192
Inode blocks per group:   512
RAID stride:              128
RAID stripe width:        256
First inode:              11
Inode size:               256
Journal inode:            8
Default directory hash:   tea
Journal backup:           inode blocks
Journal size:             128M

root at somebox:~# mount | egrep "export|gluster"
/dev/mapper/vg01-cluster on /usr/local/export type ext3 (rw,noatime,reservation)
glusterfs on /mnt/cluster type fuse (rw,nosuid,nodev,allow_other,default_permissions,max_read=1048576)

Tom Lahti wrote:
> I have 20 million+ files on ext3 with dir_index and it's rocket fast to
> locate any file, even when not cached. "ls -l" in any random directory is
> practically instant.

OK, it's only 12 million files. Sue me :P

By the way, I am re-exporting this with samba and beating the Windows 2003
Servers for performance, both write and read (read in particular) ;)

-- 
===========================
  Tom Lahti
  BIT Statement LLC
  (425)251-0833 x 117
  http://www.bitstatement.net/
============================
Andrew McGill
2008-Nov-07 08:05 UTC
[Gluster-users] Unify very slow for 2000 query to cluster / s
On Friday 17 October 2008 06:00:08 Duc Le Minh wrote:
> I built a cluster with GlusterFS 1.3, with 3 brick nodes:
> Xeon QuadCore 2.33GHz, 32GB RAM, 3Gb network (bonding), RAID 6 with 12x 1TB HDDs
> + 2 hot spares.
> The cluster holds about 3 million files, 16TB in total.
>
> Clients:
> 5 nodes, Xeon QuadCore 2.33GHz, 8GB RAM, 2Gb network (bonding), running lighttpd
> with 600 concurrent connections streaming FLV.
>
> Streaming runs fine at 5 Gbps, but if I run 'ls -l' in a directory on the
> GlusterFS mount, the whole system becomes very slow.
>
> Please help me find a solution.

If you run ls -l, that does a stat() of each file in the directory. By
comparison, echo * is lightning fast, since it only has to do a readdir()
from the main node. You won't see a delay on a low-latency network, but in
the non-developer case, it is sloooooooooooooooow.

A solution (short of reducing network latency) would be threaded versions of
filesystem tools which work with multiple files, so that they can copy, move
and stat files in parallel and benefit from filesystem parallelism --

    export GNU_COREUTILS_THREADS=8
    cp    # simultaneous read() and write() sessions
    mv    # simultaneous link and unlink calls
    ls    # parallel stat()

One could also optimise the text utilities like cat by doing the open() and
stat() operations in parallel and in the background -- userspace read-ahead
caching. All of the utilities which process multiple filenames could get
better speed from this -- rm, cat, chown, chmod ... even tail, head, wc.

I suspect that one could produce better performance even on a local
filesystem with some well placed threading in the utilities.
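Those threaded coreutils don't exist yet (GNU_COREUTILS_THREADS above is a
proposal, not a real variable), but a rough approximation is already possible
with xargs. A minimal sketch, run inside a directory on the GlusterFS mount,
with the batch size and the -P 8 parallelism chosen arbitrarily:

    # Separate the readdir() cost from the per-file stat() cost
    time echo * > /dev/null        # readdir() only
    time ls -l > /dev/null         # readdir() plus one stat() per entry

    # Issue the stat() calls in parallel (8 workers, 64 names per batch) to
    # overlap the per-file network round trips; output is discarded here
    find . -maxdepth 1 -print0 | xargs -0 -n 64 -P 8 stat > /dev/null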