[libvirt] [PATCH v3 3/8] bandwidth: Create hierarchical shaping classes

Laine Stump laine at laine.org
Tue Dec 11 16:30:18 UTC 2012


On 12/11/2012 11:09 AM, Michal Privoznik wrote:
> These classes can borrow unused bandwidth. Basically,
> only egress qdsics can have classes, therefore we can

s/qdsic/qdisc/

> do this kind of traffic shaping only on host's outgoing,
> that is domain's incoming traffic.
> ---
>  src/lxc/lxc_process.c         |    3 +-
>  src/network/bridge_driver.c   |    3 +-
>  src/qemu/qemu_command.c       |    3 +-
>  src/qemu/qemu_driver.c        |    2 +-
>  src/util/virnetdevbandwidth.c |   93 +++++++++++++++++++++++++++++++++++++---
>  src/util/virnetdevbandwidth.h |    4 +-
>  src/util/virnetdevmacvlan.c   |    2 +-
>  7 files changed, 97 insertions(+), 13 deletions(-)
>
> diff --git a/src/lxc/lxc_process.c b/src/lxc/lxc_process.c
> index 50c61c5..3e7fcb8 100644
> --- a/src/lxc/lxc_process.c
> +++ b/src/lxc/lxc_process.c
> @@ -341,7 +341,8 @@ static int virLXCProcessSetupInterfaceBridged(virConnectPtr conn,
>          goto cleanup;
>  
>      if (virNetDevBandwidthSet(net->ifname,
> -                              virDomainNetGetActualBandwidth(net)) < 0) {
> +                              virDomainNetGetActualBandwidth(net),
> +                              false) < 0) {
>          virReportError(VIR_ERR_INTERNAL_ERROR,
>                         _("cannot set bandwidth limits on %s"),
>                         net->ifname);
> diff --git a/src/network/bridge_driver.c b/src/network/bridge_driver.c
> index 00cffee..58f1d2e 100644
> --- a/src/network/bridge_driver.c
> +++ b/src/network/bridge_driver.c
> @@ -2284,7 +2284,8 @@ networkStartNetworkVirtual(struct network_driver *driver,
>          VIR_FORCE_CLOSE(tapfd);
>      }
>  
> -    if (virNetDevBandwidthSet(network->def->bridge, network->def->bandwidth) < 0) {
> +    if (virNetDevBandwidthSet(network->def->bridge,
> +                              network->def->bandwidth, true) < 0) {
>          virReportError(VIR_ERR_INTERNAL_ERROR,
>                         _("cannot set bandwidth limits on %s"),
>                         network->def->bridge);
> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
> index 9009bd2..e10eb09 100644
> --- a/src/qemu/qemu_command.c
> +++ b/src/qemu/qemu_command.c
> @@ -292,7 +292,8 @@ qemuNetworkIfaceConnect(virDomainDefPtr def,
>  
>      if (tapfd >= 0 &&
>          virNetDevBandwidthSet(net->ifname,
> -                              virDomainNetGetActualBandwidth(net)) < 0) {
> +                              virDomainNetGetActualBandwidth(net),
> +                              false) < 0) {
>          virReportError(VIR_ERR_INTERNAL_ERROR,
>                         _("cannot set bandwidth limits on %s"),
>                         net->ifname);
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index d449579..e6ae3fd 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -9034,7 +9034,7 @@ qemuDomainSetInterfaceParameters(virDomainPtr dom,
>                     sizeof(*newBandwidth->out));
>          }
>  
> -        if (virNetDevBandwidthSet(net->ifname, newBandwidth) < 0) {
> +        if (virNetDevBandwidthSet(net->ifname, newBandwidth, false) < 0) {
>              virReportError(VIR_ERR_INTERNAL_ERROR,
>                             _("cannot set bandwidth limits on %s"),
>                             device);
> diff --git a/src/util/virnetdevbandwidth.c b/src/util/virnetdevbandwidth.c
> index 49fc425..71c272e 100644
> --- a/src/util/virnetdevbandwidth.c
> +++ b/src/util/virnetdevbandwidth.c
> @@ -45,17 +45,21 @@ virNetDevBandwidthFree(virNetDevBandwidthPtr def)
>   * virNetDevBandwidthSet:
>   * @ifname: on which interface
>   * @bandwidth: rates to set (may be NULL)
> + * @hierarchical_class: whether to create hierarchical class
>   *
>   * This function enables QoS on specified interface
>   * and set given traffic limits for both, incoming
>   * and outgoing traffic. Any previous setting get
> - * overwritten.
> + * overwritten. If @hierarchical_class is TRUE, create
> + * hierarchical class. It is used to guarantee minimal
> + * throughput ('floor' attribute in NIC).
>   *
>   * Return 0 on success, -1 otherwise.
>   */
>  int
>  virNetDevBandwidthSet(const char *ifname,
> -                      virNetDevBandwidthPtr bandwidth)
> +                      virNetDevBandwidthPtr bandwidth,
> +                      bool hierarchical_class)
>  {
>      int ret = -1;
>      virCommandPtr cmd = NULL;
> @@ -71,7 +75,7 @@ virNetDevBandwidthSet(const char *ifname,
>  
>      virNetDevBandwidthClear(ifname);
>  
> -    if (bandwidth->in) {
> +    if (bandwidth->in && bandwidth->in->average) {
>          if (virAsprintf(&average, "%llukbps", bandwidth->in->average) < 0)
>              goto cleanup;
>          if (bandwidth->in->peak &&
> @@ -83,15 +87,89 @@ virNetDevBandwidthSet(const char *ifname,
>  
>          cmd = virCommandNew(TC);
>          virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "root",
> -                             "handle", "1:", "htb", "default", "1", NULL);
> +                             "handle", "1:", "htb", "default",
> +                             hierarchical_class ? "2" : "1", NULL);
>          if (virCommandRun(cmd, NULL) < 0)
>              goto cleanup;
>  
> +        /* If we are creating a hierarchical class, all non guaranteed traffic
> +         * goes to the 1:2 class which will adjust 'rate' dynamically as NICs
> +         * with guaranteed throughput are plugged and unplugged. Class 1:1
> +         * exists so we don't exceed the maximum limit for the network. For each
> +         * NIC with guaranteed throughput a separate classid will be created.
> +         * NB '1:' is just a shorter notation of '1:0'.
> +         *
> +         * To get a picture how this works:
> +         *
> +         * +-----+     +---------+     +-----------+      +-----------+     +-----+
> +         * |     |     |  qdisc  |     | class 1:1 |      | class 1:2 |     |     |
> +         * | NIC |     | def 1:2 |     |   rate    |      |   rate    |     | sfq |
> +         * |     | --> |         | --> |   peak    | -+-> |   peak    | --> |     |
> +         * +-----+     +---------+     +-----------+  |   +-----------+     +-----+
> +         *                                            |
> +         *                                            |   +-----------+     +-----+
> +         *                                            |   | class 1:3 |     |     |
> +         *                                            |   |   rate    |     | sfq |
> +         *                                            +-> |   peak    | --> |     |
> +         *                                            |   +-----------+     +-----+
> +         *                                           ...
> +         *                                            |   +-----------+     +-----+
> +         *                                            |   | class 1:n |     |     |
> +         *                                            |   |   rate    |     | sfq |
> +         *                                            +-> |   peak    | --> |     |
> +         *                                                +-----------+     +-----+
> +         *
> +         * After the routing decision, when is it clear a packet is to be sent
> +         * via a particular NIC, it is sent to the root qdisc (queueing
> +         * discipline). In this case HTB (Hierarchical Token Bucket). It has
> +         * only one direct child class (with id 1:1) which shapes the overall
> +         * rate that is sent through the NIC.  This class has at least one child
> +         * (1:2) which is meant for all non-privileged (non guaranteed) traffic
> +         * from all domains. Then, for each interface with guaranteed
> +         * throughput, a separate class (1:n) is created. Imagine a class is a
> +         * box. Whenever a packet ends up in a class it is stored in this box
> +         * until the kernel sends it, then it is removed from box. Packets are
> +         * placed into boxes based on rules (filters) - e.g. depending on
> +         * destination IP/MAC address. If there is no rule to be applied, the
> +         * root qdisc has a default where such packets go (1:2 in this case).
> +         * Packets come in over and over again and boxes get filled more and
> +         * more. Imagine that kernel sends packets just once a second. So it
> +         * starts to traverse through this tree. It starts with the root qdisc
> +         * and through 1:1 it gets to 1:2. It sends packets up to 1:2's 'rate'.
> +         * Then it moves to 1:3 and again sends packets up to 1:3's 'rate'.  The
> +         * whole process is repeated until 1:n is processed. So now we have
> +         * ensured each class its guaranteed bandwidth. If the sum of sent data
> +         * doesn't exceed the 'rate' in 1:1 class, we can go further and send
> +         * more packets. The rest of available bandwidth is distributed to the
> +         * 1:2,1:3...1:n classes by ratio of their 'rate'. As soon as the root
> +         * 'rate' limit is reached or there are no more packets to send, we stop
> +         * sending and wait another second. Each class has an SFQ qdisc which
> +         * shuffles packets in boxes stochastically, so one sender cannot
> +         * starve others.
> +         *
> +         * Therefore, whenever we want to plug in a new guaranteed interface, we
> +         * need to create a new class and adjust the 'rate' of the 1:2 class.
> +         * When unplugging we do the exact opposite - remove the associated
> +         * class, and adjust the 'rate'.
> +         *
> +         * This description is rather long, but it is still a good idea to read
> +         * it before you dig into the code.
> +         */
> +        if (hierarchical_class) {
> +            virCommandFree(cmd);
> +            cmd = virCommandNew(TC);
> +            virCommandAddArgList(cmd, "class", "add", "dev", ifname, "parent",
> +                                 "1:", "classid", "1:1", "htb", "rate", average,
> +                                 "ceil", peak ? peak : average, NULL);
> +            if (virCommandRun(cmd, NULL) < 0)
> +                goto cleanup;
> +        }
>          virCommandFree(cmd);
>          cmd = virCommandNew(TC);
>          virCommandAddArgList(cmd,"class", "add", "dev", ifname, "parent",
> -                             "1:", "classid", "1:1", "htb", NULL);
> -        virCommandAddArgList(cmd, "rate", average, NULL);
> +                             hierarchical_class ? "1:1" : "1:", "classid",
> +                             hierarchical_class ? "1:2" : "1:1", "htb",
> +                             "rate", average, NULL);
>  
>          if (peak)
>              virCommandAddArgList(cmd, "ceil", peak, NULL);
> @@ -104,7 +182,8 @@ virNetDevBandwidthSet(const char *ifname,
>          virCommandFree(cmd);
>          cmd = virCommandNew(TC);
>          virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "parent",
> -                             "1:1", "handle", "2:", "sfq", "perturb",
> +                             hierarchical_class ? "1:2" : "1:1",
> +                             "handle", "2:", "sfq", "perturb",
>                               "10", NULL);
>  
>          if (virCommandRun(cmd, NULL) < 0)
> diff --git a/src/util/virnetdevbandwidth.h b/src/util/virnetdevbandwidth.h
> index 35f8b89..d308ab2 100644
> --- a/src/util/virnetdevbandwidth.h
> +++ b/src/util/virnetdevbandwidth.h
> @@ -42,7 +42,9 @@ struct _virNetDevBandwidth {
>  
>  void virNetDevBandwidthFree(virNetDevBandwidthPtr def);
>  
> -int virNetDevBandwidthSet(const char *ifname, virNetDevBandwidthPtr bandwidth)
> +int virNetDevBandwidthSet(const char *ifname,
> +                          virNetDevBandwidthPtr bandwidth,
> +                          bool hierarchical_class)
>      ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK;
>  int virNetDevBandwidthClear(const char *ifname)
>      ATTRIBUTE_NONNULL(1);
> diff --git a/src/util/virnetdevmacvlan.c b/src/util/virnetdevmacvlan.c
> index d8e646a..657c484 100644
> --- a/src/util/virnetdevmacvlan.c
> +++ b/src/util/virnetdevmacvlan.c
> @@ -925,7 +925,7 @@ create_name:
>          rc = 0;
>      }
>  
> -    if (virNetDevBandwidthSet(cr_ifname, bandwidth) < 0) {
> +    if (virNetDevBandwidthSet(cr_ifname, bandwidth, false) < 0) {
>          virReportError(VIR_ERR_INTERNAL_ERROR,
>                         _("cannot set bandwidth limits on %s"),
>                         cr_ifname);

ACK.




More information about the libvir-list mailing list