From 95c0ec5cb26efbe2c5dbf45df21518d8d1776be0 Mon Sep 17 00:00:00 2001 From: Don Penney Date: Wed, 4 Jan 2017 12:15:53 -0500 Subject: [PATCH] TIS Patches This patch rolls up the previous TIS patches, which include: 1. CGTS-4787 Set DRBD service ensure parameter 2. Updates to fix DRBD resync-rate and engineered parameters: There are several DRBD performance-related parameters that must be set to get reasonable resync performance; otherwise the default resync throughput is limited to 40MB/s. Note that the user community has noted this limit when they use default settings, or up-rev DRBD from 8.3, etc. E.g., they realize they hit this limit despite having a 10G link or better and faster disks. The following parameters were added to the puppet-drbd module for resource file generation, in addition to: c-plan-ahead, c-fill-target, c-min-rate, c-max-rate, currently engineered for dynamic resync-rates. disk section: - 'resync-rate' (aka 'rate') was missed in the CentOS port from Kilo - 'al-extents' set to 3389, a prime number. Increasing this improves random write throughput. Could set a bit higher, but would need a study. net section: - 'max-buffers' engineered to scale with supported MBps; setting it too low (e.g., the default setting) is a bottleneck on a 10G link. Set this to the maximum settable value of 20000. Note this parameter may be settable to larger values in more recent DRBD revisions. If we need to support faster disks, we will likely need to increase this proportionately. - 'max-epoch-size' also set to 20000. The DRBD tuning recommendations page sets this the same as max-buffers. 
- 'unplug-watermark' set to 16 based on DRBD tuning recommendations page - 'sndbuf-size' set to 0 to auto-tune; historically default was too small - 'rcvbuf-size' set to 0 to auto-tune --- manifests/init.pp | 11 ++-- manifests/resource.pp | 93 +++++++++++++++++++++++++--- manifests/resource/up.pp | 2 +- manifests/service.pp | 2 +- templates/header.res.erb | 53 ++++++++++++++-- templates/primary-resource.res.erb | 2 +- templates/primary-stacked-resource.res.erb | 2 +- templates/resource.res.erb | 2 +- templates/secondary-resource.res.erb | 2 +- templates/secondary-stacked-resource.res.erb | 2 +- 10 files changed, 148 insertions(+), 23 deletions(-) diff --git a/manifests/init.pp b/manifests/init.pp index 09f7d48..76ce9c9 100644 --- a/manifests/init.pp +++ b/manifests/init.pp @@ -6,7 +6,8 @@ # class drbd( $service_enable = true, - $package_name = 'drbd8-utils', + $service_ensure = 'running', + $package_name = 'drbd-utils', ) { include ::drbd::service @@ -22,7 +23,7 @@ class drbd( } File { - mode => '0644', + mode => '0640', owner => 'root', group => 'root', require => Package['drbd'], @@ -45,8 +46,10 @@ class drbd( # only allow files managed by puppet in this directory. file { '/etc/drbd.d': ensure => directory, - mode => '0644', - purge => true, + mode => '0640', + # Set purge to false so that it does not clear the dir + # when the 2nd drbd resource is added. + purge => false, recurse => true, force => true, require => Package['drbd'], diff --git a/manifests/resource.pp b/manifests/resource.pp index af2ff77..10edc1a 100644 --- a/manifests/resource.pp +++ b/manifests/resource.pp @@ -22,6 +22,10 @@ # [ha_primary] If the resource is being applied on the primary host. # [initial_setup] If this run is associated with the initial setup. Allows a user # to only perform dangerous setup on the initial run. 
+# [link_util] replication link network utilization percent +# [link_speed] replication link network speed mbps +# [num_parallel] number of parallel drbd filesystems to sync +# [rtt_ms] round-trip-time milliseconds (i.e., ping between replication nodes) define drbd::resource ( $host1 = undef, $host2 = undef, @@ -39,7 +43,10 @@ define drbd::resource ( $group = 'root', $protocol = 'C', $verify_alg = 'crc32c', - $rate = false, + $link_util = false, + $link_speed = false, + $num_parallel = false, + $rtt_ms = false, $net_parameters = false, $manage = true, $ha_primary = false, @@ -47,6 +54,7 @@ define drbd::resource ( $fs_type = 'ext4', $mkfs_opts = '', $disk = undef, + $handlers = false, ) { include ::drbd @@ -67,6 +75,75 @@ define drbd::resource ( group => $group, } + if $link_util and $link_speed and $num_parallel and $rtt_ms { + # Engineer drbd variable sync rate parameters based on the following: + # https://blogs.linbit.com/p/128/drbd-sync-rate-controller/ + # https://blogs.linbit.com/p/443/drbd-sync-rate-controller-2/ + # Methodology adapted to account for replication link speed and parallelism. + + # Since there is no aggregate bandwidth control, prorate the drbd + # replication bandwidth based on parallelism. + # Based on experimentation, it seems generally better to set num_parallel + # to 1 and let DRBD auto-regulate its throughput. The end result is that + # multiple competing filesystems (i.e., on same disk device) already have + # their sync throughput reduced. + $mbps = $link_speed / $num_parallel + + # bandwidth delay product + $bdp_k = $mbps * $rtt_ms + + # engineer initial sync rate as percent of link bandwidth + $rate_M = floor($link_util * $mbps / 8 / 100) + $rate = "${rate_M}M" + + # engineer c_plan_ahead to default value (tenths) + # Documentation indicates this value OK even for 200 ms RTT. 
+ $c_plan_ahead = 20 + + # engineer c_fill_target as 1*BDP (tune within 1x to 3x BDP; + # choose minimum value that saturates bandwidth) + $fill_target_k = floor(1 * $bdp_k) + $c_fill_target = "${fill_target_k}k" + + # engineer c_min_rate -- experimentally determined so DRBD is not + # throttled to a crawl even when there is minimal application IO. + # DRBD default is way too small. + $min_rate_M = 15 + floor($link_util * $mbps / 8 / 100 / 25) + $c_min_rate = "${min_rate_M}M" + + # engineer c_max_rate as percent of link bandwidth + $max_rate_M = floor($link_util * $mbps / 8 / 100) + $c_max_rate = "${max_rate_M}M" + + # various tuning settings to enable larger link bandwidth (eg, 10G) + # max_buffers should scale with MBps; set to maximum settable + $max_buffers = 20000 + $max_epoch_size = 20000 + $unplug_watermark = 16 + # sndbuf_size and rcvbuf_size should scale with mbps; set 0 to auto-tune + $sndbuf_size = 0 + $rcvbuf_size = 0 + # increase al_extents to improve random write throughput; set to prime number + $al_extents = 3389 + } else { + # disable variable sync rate + $c_plan_ahead = 0 + $c_fill_target = false + $c_min_rate = false + $c_max_rate = false + + # engineer fixed sync rate at 40 percent of 1G + $rate_M = floor(40 * 1000 / 8 / 100) + $rate = "${rate_M}M" + + $max_buffers = false + $max_epoch_size = false + $unplug_watermark = false + $sndbuf_size = false + $rcvbuf_size = false + $al_extents = false + } + concat { "/etc/drbd.d/${name}.res": mode => '0600', require => [ @@ -94,13 +171,13 @@ define drbd::resource ( } # Export our fragment for the clustered node if $ha_primary and $cluster { - @@concat::fragment { "${name} ${cluster} primary resource": + concat::fragment { "${name} ${cluster} primary resource": target => "/etc/drbd.d/${name}.res", content => template('drbd/resource.res.erb'), order => '10', } } elsif $cluster { - @@concat::fragment { "${name} ${cluster} secondary resource": + concat::fragment { "${name} ${cluster} secondary resource": 
target => "/etc/drbd.d/${name}.res", content => template('drbd/resource.res.erb'), order => '20', @@ -137,11 +214,11 @@ define drbd::resource ( order => '99', } - if $cluster { - # Import cluster nodes - Concat::Fragment <<| title == "${name} ${cluster} primary resource" |>> - Concat::Fragment <<| title == "${name} ${cluster} secondary resource" |>> - } +# if $cluster { +# # Import cluster nodes +# Concat::Fragment <<| title == "${name} ${cluster} primary resource" |>> +# Concat::Fragment <<| title == "${name} ${cluster} secondary resource" |>> +# } # Due to a bug in puppet, defined() conditionals must be in a defined # resource to be evaluated *after* the collector instead of before. diff --git a/manifests/resource/up.pp b/manifests/resource/up.pp index 7668792..b626f55 100644 --- a/manifests/resource/up.pp +++ b/manifests/resource/up.pp @@ -70,7 +70,7 @@ define drbd::resource::up ( # ensure that the device is mounted mount { $mountpoint: ensure => mounted, - atboot => false, + atboot => yes, device => $device, fstype => 'auto', options => 'defaults,noauto', diff --git a/manifests/service.pp b/manifests/service.pp index de56b34..f9b217a 100644 --- a/manifests/service.pp +++ b/manifests/service.pp @@ -1,6 +1,6 @@ class drbd::service { @service { 'drbd': - ensure => running, + ensure => $drbd::service_ensure, enable => $drbd::service_enable, require => Package['drbd'], restart => 'service drbd reload', diff --git a/templates/header.res.erb b/templates/header.res.erb index 2d785c4..a3256a3 100644 --- a/templates/header.res.erb +++ b/templates/header.res.erb @@ -5,7 +5,32 @@ resource <%= @name %> { disk <%= @disk %>; meta-disk internal; + disk { +<% if @rate -%> + resync-rate <%= @rate %>; +<% end -%> +<% if @c_plan_ahead -%> + c-plan-ahead <%= @c_plan_ahead %>; +<% end -%> +<% if @c_fill_target -%> + c-fill-target <%= @c_fill_target %>; +<% end -%> +<% if @c_min_rate -%> + c-min-rate <%= @c_min_rate %>; +<% end -%> +<% if @c_max_rate -%> + c-max-rate <%= @c_max_rate 
%>; +<% end -%> +<% if @al_extents -%> + al-extents <%= @al_extents %>; +<% end -%> + } + net { + after-sb-0pri discard-zero-changes; + after-sb-1pri discard-secondary; + after-sb-2pri disconnect; + cram-hmac-alg sha1; <% if @secret -%> shared-secret "<%= @secret %>"; @@ -16,12 +41,32 @@ resource <%= @name %> { <%= k %> <%= v %>; <% end -%> <% end -%> - } - syncer { +<% if @max_buffers -%> + max-buffers <%= @max_buffers %>; +<% end -%> +<% if @max_epoch_size -%> + max-epoch-size <%= @max_epoch_size %>; +<% end -%> +<% if @unplug_watermark -%> + unplug-watermark <%= @unplug_watermark %>; +<% end -%> +<% if @sndbuf_size -%> + sndbuf-size <%= @sndbuf_size %>; +<% end -%> +<% if @rcvbuf_size -%> + rcvbuf-size <%= @rcvbuf_size %>; +<% end -%> +<% if @verify_alg -%> verify-alg <%= @verify_alg %>; -<% if @rate -%> - rate <%= @rate %>; <% end -%> } +<% if @handlers -%> + handlers { +<% @handlers.sort_by {|k, v| k}.each do |k, v| -%> + <%= k %> "<%= v %>"; +<% end -%> + } +<% end -%> + diff --git a/templates/primary-resource.res.erb b/templates/primary-resource.res.erb index f8af77e..6032fd2 100644 --- a/templates/primary-resource.res.erb +++ b/templates/primary-resource.res.erb @@ -1,3 +1,3 @@ on <%= @host1 %> { - address <%= @ip1 %>:<%= @port %>; + address <%= IPAddr.new(@ip1).ipv6?() ? "ipv6 ["+@ip1+"]:"+@port : "ipv4 "+@ip1+":"+@port %>; } diff --git a/templates/primary-stacked-resource.res.erb b/templates/primary-stacked-resource.res.erb index 7eb4dad..a22d8b3 100644 --- a/templates/primary-stacked-resource.res.erb +++ b/templates/primary-stacked-resource.res.erb @@ -1,3 +1,3 @@ stacked-on-top-of <%= @res1 %> { - address <%= @ip1 %>:<%= @port %>; + address <%= IPAddr.new(ip1).ipv6?() ? 
"ipv6 ["+ip1+"]:"+port : "ipv4 "+ip1+":"+port %>; } diff --git a/templates/resource.res.erb b/templates/resource.res.erb index 047877e..9dd4c4d 100644 --- a/templates/resource.res.erb +++ b/templates/resource.res.erb @@ -1,3 +1,3 @@ on <%= @hostname %> { - address <%= @ipaddress %>:<%= @port %>; + address <%= IPAddr.new(ipaddress).ipv6?() ? "ipv6 ["+ipaddress+"]:"+@port : "ipv4 "+ipaddress+":"+port %>; } diff --git a/templates/secondary-resource.res.erb b/templates/secondary-resource.res.erb index 678640a..cf2fd96 100644 --- a/templates/secondary-resource.res.erb +++ b/templates/secondary-resource.res.erb @@ -1,3 +1,3 @@ on <%= @host2 %> { - address <%= @ip2 %>:<%= @port %>; + address <%= IPAddr.new(@ip2).ipv6?() ? "ipv6 ["+@ip2+"]:"+@port : "ipv4 "+@ip2+":"+@port %>; } diff --git a/templates/secondary-stacked-resource.res.erb b/templates/secondary-stacked-resource.res.erb index 409a705..87d28f5 100644 --- a/templates/secondary-stacked-resource.res.erb +++ b/templates/secondary-stacked-resource.res.erb @@ -1,3 +1,3 @@ stacked-on-top-of <%= @res2 %> { - address <%= @ip2 %>:<%= @port %>; + address <%= IPAddr.new(ip2).ipv6?() ? "ipv6 ["+ip2+"]:"+port : "ipv4 "+ip2+":"+port %>; } -- 1.8.3.1