Hi, all

I ran some tests of MySQL insert performance on ZFS and hit a big
performance problem; I'm not sure what the cause is.
Environment:
2x Intel X5560 (8 cores), 12 GB RAM, 7 Intel SLC SSDs.

A Java client runs 8 threads inserting concurrently into one InnoDB table:
 ~600 qps   when sync_binlog=1  & innodb_flush_log_at_trx_commit=1
 ~600 qps   when sync_binlog=10 & innodb_flush_log_at_trx_commit=1
 ~600 qps   when sync_binlog=0  & innodb_flush_log_at_trx_commit=1

 ~900 qps   when sync_binlog=1  & innodb_flush_log_at_trx_commit=0
 ~5500 qps  when sync_binlog=10 & innodb_flush_log_at_trx_commit=0
 ~15000 qps when sync_binlog=0  & innodb_flush_log_at_trx_commit=0

 ~800 qps   when sync_binlog=1  & innodb_flush_log_at_trx_commit=2
 ~4500 qps  when sync_binlog=10 & innodb_flush_log_at_trx_commit=2
 ~13000 qps when sync_binlog=0  & innodb_flush_log_at_trx_commit=2
When sync_binlog=1 & innodb_flush_log_at_trx_commit=1, qps is far too low.
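(For reference, both settings are dynamic globals, so they can be switched
between runs without restarting mysqld. A rough sketch, using the socket
path from the config further below; exact client options may differ:

[root@ssd /]# mysql -S /usr/local/mysql/sock/mysql3.sock \
    -e "SET GLOBAL sync_binlog = 10; SET GLOBAL innodb_flush_log_at_trx_commit = 2;"
)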
I collected some stats while qps was below 1000:
[root@ssd /data/mysqldata3]# truss -c -p 13968
^C
syscall              seconds    calls  errors
read                    .649    90816   30265
write                   .770    57157
open                    .000        4
close                   .000        4
time                    .368    83358
lseek                   .000       66
fdsync                 2.250    80699
fcntl                   .268    60530
lwp_park                .210    28842
lwp_unpark              .198    28842
yield                   .000       47
pread                   .025      250
pwrite                  .857    53880
pollsys                 .005      603
                    --------   ------    ----
sys totals:            5.605   485098   30265
usr time:              5.519
elapsed:              61.520
PS: 13968 is the mysqld process's pid.
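(The truss summary above only shows totals, not per-call latency. A rough
way to sample fdsync latency, assuming DTrace is usable on this box and
13968 is still the mysqld pid, would be something like:

[root@ssd /]# dtrace -n '
  syscall::fdsync:entry  /pid == 13968/ { self->ts = timestamp; }
  syscall::fdsync:return /self->ts/     { @["fdsync latency (us)"] = quantize((timestamp - self->ts) / 1000); self->ts = 0; }'
)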
[root@ssd /data/mysqldata3]# vmstat 1
 kthr      memory            page            disk          faults      cpu
 r b w   swap    free   re  mf pi po fr de sr s1  s2  s3  s5   in   sy    cs us sy id
 0 0 0  575836 1388816   1  18  0  0  0  0  1  3  17  17  17 1226 4185  2008  1  0 99
 0 0 0  211652 5620432  21  99  0  0  0  0  0  0 461 470 473 4207 8834 16311  1  1 99
 0 0 0  211496 5620260   0  26  0  0  0  0  0  0 469 468 467 4168 8789 16328  1  1 99
 0 0 0  211496 5620164   0  28  0  0  0  0  0  0 504 504 505 4471 9443 17606  1  1 98
 0 0 0  211496 5620100   0  20  0  0  0  0  0  0 504 505 505 4527 9525 17618  1  1 99
 0 0 0  211496 5620004   0  20  0  0  0  0  0  0 507 506 505 4491 9494 17630  1  1 98
 0 0 0  211496 5619940   0  12  0  0  0  0  0  0 507 508 509 4512 9497 17743  1  1 98
 0 0 0  211496 5619876   0  24  0  0  0  0  0  0 504 502 503 4370 9486 17650  1  1 98
 0 0 0  211488 5619804   0  12  0  0  0  0  0  0 508 509 508 4341 9636 17853  0  1 99
^C
[root@ssd /data/mysqldata3]# zpool iostat data 1
               capacity     operations    bandwidth
pool         used  avail   read  write   read  write
----------  -----  -----  -----  -----  -----  -----
data         141G  37.9G      4     51   144K  3.15M
data         141G  37.9G      1  1.50K  11.9K  6.06M
data         141G  37.9G      0  1.37K      0  5.48M
data         141G  37.9G      0  1.49K      0  5.98M
data         141G  37.9G    214  1.45K  5.22M  7.27M
data         141G  37.9G      0  1.37K      0  5.48M
data         141G  37.9G      0  1.39K      0  5.58M
data         141G  37.9G      0  1.48K      0  5.92M
data         141G  37.9G     51  1.50K  1.98M  6.06M
data         141G  37.9G      0  2.09K      0  23.7M
data         141G  37.9G      0  1.38K      0  5.52M
data         141G  37.9G      0  1.37K  7.92K  6.09M
ZFS config details:
[root@ssd /]# zpool status
  pool: data
 state: ONLINE
 scrub: none requested
config:

        NAME        STATE     READ WRITE CKSUM
        data        ONLINE       0     0     0
          c0t4d0    ONLINE       0     0     0
          c0t5d0    ONLINE       0     0     0
          c0t7d0    ONLINE       0     0     0
          c0t8d0    ONLINE       0     0     0
          c0t9d0    ONLINE       0     0     0
          c0t10d0   ONLINE       0     0     0

errors: No known data errors

  pool: rpool
 state: ONLINE
 scrub: none requested
config:

        NAME        STATE     READ WRITE CKSUM
        rpool       ONLINE       0     0     0
          c0t3d0s0  ONLINE       0     0     0

errors: No known data errors
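(For reference, the data pool is a plain six-device stripe with no mirror,
raidz, or separate log device; presumably it was created with something
along the lines of:

[root@ssd /]# zpool create data c0t4d0 c0t5d0 c0t7d0 c0t8d0 c0t9d0 c0t10d0
)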
[root@ssd /]# zfs get all data/mysqldata3
NAME PROPERTY VALUE SOURCE
data/mysqldata3 type filesystem -
data/mysqldata3 creation Thu Mar 19 9:47 2009 -
data/mysqldata3 used 12.4G -
data/mysqldata3 available 83.0G -
data/mysqldata3 referenced 12.4G -
data/mysqldata3 compressratio 1.00x -
data/mysqldata3 mounted yes -
data/mysqldata3 quota none default
data/mysqldata3 reservation none default
data/mysqldata3 recordsize 8K local
data/mysqldata3 mountpoint /data/mysqldata3 default
data/mysqldata3 sharenfs off default
data/mysqldata3 checksum on default
data/mysqldata3 compression off default
data/mysqldata3 atime on default
data/mysqldata3 devices on default
data/mysqldata3 exec on default
data/mysqldata3 setuid on default
data/mysqldata3 readonly off default
data/mysqldata3 zoned off default
data/mysqldata3 snapdir hidden default
data/mysqldata3 aclmode groupmask default
data/mysqldata3 aclinherit restricted default
data/mysqldata3 canmount on default
data/mysqldata3 shareiscsi off default
data/mysqldata3 xattr on default
data/mysqldata3 copies 1 default
data/mysqldata3 version 3 -
data/mysqldata3 utf8only off -
data/mysqldata3 normalization none -
data/mysqldata3 casesensitivity sensitive -
data/mysqldata3 vscan off default
data/mysqldata3 nbmand off default
data/mysqldata3 sharesmb off default
data/mysqldata3 refquota none default
data/mysqldata3 refreservation none default
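(The recordsize above shows SOURCE "local", i.e. it was set directly on
this dataset, presumably with something like the following before the data
files were copied in:

[root@ssd /]# zfs set recordsize=8k data/mysqldata3
)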
MySQL config details:
...
[mysqld3]
lower_case_table_names=1
user=mysql
port = 3308
socket = /usr/local/mysql/sock/mysql3.sock
pid-file=/usr/local/mysql/sock/mysql3.pid
datadir=/data/mysqldata3/mydata
tmpdir =/data/mysqldata3/tmpdir
skip-locking
skip-name-resolve
back_log=100
interactive_timeout=172800
default-storage-engine = INNODB
default-character-set = utf8
max_connections = 500
max_connect_errors=100000
max_allowed_packet = 4M
max_heap_table_size = 1024M
max_length_for_sort_data = 4096
net_buffer_length = 8K
sort_buffer_size = 8M
join_buffer_size = 16M
#if lock memory
#memlock
table_cache = 1024
thread_cache_size = 512
thread_concurrency = 8
query_cache_type=0
#memory table max size
tmp_table_size = 128M
#******************************* Logs related settings ***************************
log-error=/usr/local/mysql/log/error3.log
long_query_time = 1
log_long_format
slow_query_log=/usr/local/mysql/log/slow-query3.log
binlog_cache_size = 2M
max_binlog_size = 512M
log-bin=/data/mysqldata3/binlog/mysql-bin
sync_binlog=1
#log warnings
log_warnings
#******************************* Replication related settings **********************
#master
server-id = 3
binlog_format=mixed
#******************************* MyISAM Specific options ****************************
key_buffer_size = 32M
read_buffer_size = 8M
read_rnd_buffer_size = 2M
bulk_insert_buffer_size = 16M
myisam_sort_buffer_size = 64M
myisam_max_sort_file_size = 10G
myisam_max_extra_sort_file_size = 10G
myisam_repair_threads = 1
myisam_recover
# ***************************** INNODB Specific options ****************************
innodb_file_per_table
innodb_additional_mem_pool_size = 16M
innodb_buffer_pool_size = 2048M
innodb_data_home_dir = /data/mysqldata3/innodb_ts
innodb_data_file_path = ibdata1:256M:autoextend
innodb_file_io_threads = 4
innodb_thread_concurrency = 0
innodb_flush_log_at_trx_commit = 1
innodb_log_buffer_size = 8M
innodb_log_file_size = 512M
innodb_log_files_in_group = 5
innodb_log_group_home_dir = /data/mysqldata3/innodb_log
innodb_max_dirty_pages_pct = 20
innodb_lock_wait_timeout = 120
innodb_doublewrite=0
...
Can anyone help me understand why fsync on ZFS is so slow here? Or is there
some other problem?
--
Sky.Jian  iMySQLer
Blog:  http://www.jianzhaoyang.com
BBS:   http://iMySQLer.com
Group: http://groups.google.com/group/mysqler
Kees Nuyt
2009-Apr-15 17:39 UTC
[zfs-discuss] MySQL On ZFS Performance(fsync) Problem?
On Wed, 15 Apr 2009 14:28:45 +0800, Sky.Jian <sky000 at gmail.com> wrote:

> Hi, all
>
> I ran some tests of MySQL insert performance on ZFS and hit a big
> performance problem; I'm not sure what the cause is.

[snip performance and config info]

> Can anyone help me understand why fsync on ZFS is so slow here?
> Or is there some other problem?

My guess: the InnoDB engine uses copy-on-write internally. ZFS adds
another layer of copy-on-write. Both try to optimize localization
(keep related data close on the disk). Amongst other things, this
fight between the two causes fragmentation.

Performance would get better if someone designed a MySQL storage engine
which is aware of ZFS and uses ZFS copy-on-write primitives. I doubt it
will be called InnoDB, because InnoDB is Oracle-owned and probably not
maintained by Sun.
--
  (  Kees Nuyt
c[_]
Nicolas Williams
2009-Apr-15 17:45 UTC
[zfs-discuss] MySQL On ZFS Performance(fsync) Problem?
On Wed, Apr 15, 2009 at 07:39:13PM +0200, Kees Nuyt wrote:
> On Wed, 15 Apr 2009 14:28:45 +0800, Sky.Jian <sky000 at gmail.com> wrote:
> > I ran some tests of MySQL insert performance on ZFS and hit a big
> > performance problem; I'm not sure what the cause is.

Q1: Did you set the filesystem's recordsize to match MySQL/InnoDB's page
size? If not, then try doing so (and re-create/copy the DB files to ensure
they get the new recordsize).

Q2: Did you disable the ZIL? If so, then do re-enable it.

> [snip performance and config info]
>
> > Can anyone help me understand why fsync on ZFS is so slow here?
> > Or is there some other problem?
>
> My guess: the InnoDB engine uses copy-on-write internally. ZFS adds
> another layer of copy-on-write. Both try to optimize localization
> (keep related data close on the disk).
>
> Amongst other things, this fight between the two causes fragmentation.

I doubt that's the problem. On ZFS, fsync() would mean syncing more than
just the writes to the given file, rather all the pending writes. To make
that go faster, ZFS has the ZIL as a way to avoid having to commit an
entire ZFS transaction. But even so, writes to the ZIL are synchronous.
If fsync()s are too slow even with the ZIL enabled, then you should put
the ZIL on a write-biased flash device if at all possible.

> Performance would get better if someone designed a MySQL storage engine
> which is aware of ZFS and uses ZFS copy-on-write primitives.

That may be, but I don't believe that two layers of COW will cause
problems in this case. See my questions above.

Nico
--
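(A sketch of the checks and changes Nico suggests. The spare device name
c0t11d0 is hypothetical, and existing files only pick up a new recordsize
when they are re-copied onto the dataset:

# InnoDB's compiled-in page size (usually 16K) vs. the dataset recordsize
[root@ssd /]# mysql -S /usr/local/mysql/sock/mysql3.sock \
    -e "SHOW STATUS LIKE 'Innodb_page_size'"
[root@ssd /]# zfs get recordsize data/mysqldata3

# Match the recordsize to the page size, then re-copy the DB files
[root@ssd /]# zfs set recordsize=16k data/mysqldata3

# Put the ZIL on a dedicated log device, e.g. one of the spare SSDs
[root@ssd /]# zpool add data log c0t11d0
)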
Richard Elling
2009-Apr-16 21:29 UTC
[zfs-discuss] MySQL On ZFS Performance(fsync) Problem?
Sky.Jian wrote:
> I ran some tests of MySQL insert performance on ZFS and hit a big
> performance problem; I'm not sure what the cause is.
>
> [snip environment and qps results]
>
> [root@ssd /data/mysqldata3]# truss -c -p 13968
> [snip]
> fdsync                 2.250    80699

In my experience, when fdsync() shows up as the biggest bottleneck, then
the storage is slow. What does the latency to disk look like? Look at the
"iostat" data for service times, not "zpool iostat", which does not report
latency. You can also look at zilstat, which will show you how much work
is being done by ZFS to handle sync writes.
http://richardelling.blogspot.com/2009/02/zilstat-improved.html

> [root@ssd /data/mysqldata3]# vmstat 1
> [snip vmstat output]

This machine is idle.

> [root@ssd /data/mysqldata3]# zpool iostat data 1
> [snip zpool iostat output]

This is a small workload, perhaps because the machine is idle?
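(For example, something along these lines; the zilstat invocation assumes
the script from the blog post above has been downloaded to the current
directory, and its exact options may differ by version:

# Per-device service times (wsvc_t/asvc_t) and %busy, sampled every second
[root@ssd /]# iostat -xn 1

# ZIL sync-write activity, one-second samples
[root@ssd /]# ./zilstat 1 10
)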
-- richard