r37389 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r37388‎ | r37389 | r37390 >
Date:14:45, 9 July 2008
Author:mark
Status:old
Tags:
Comment:
First test version with intended functionality.
Modified paths:
  • /trunk/routing/lvs/net (added) (history)
  • /trunk/routing/lvs/net/ipv4 (added) (history)
  • /trunk/routing/lvs/net/ipv4/ipvs (added) (history)
  • /trunk/routing/lvs/net/ipv4/ipvs/ip_vs_wcsh.c (added) (history)

Diff [purge]

Index: trunk/routing/lvs/net/ipv4/ipvs/ip_vs_wcsh.c
@@ -0,0 +1,335 @@
 2+/*
 3+ * IPVS: Weighted Consistent Source Hashing scheduling module
 4+ *
 5+ * Version: $Id$
 6+ *
 7+ * Authors: Mark Bergsma <mark@nedworks.org>
 8+ *
 9+ * This program is free software; you can redistribute it and/or
 10+ * modify it under the terms of the GNU General Public License
 11+ * as published by the Free Software Foundation; either version
 12+ * 2 of the License, or (at your option) any later version.
 13+ *
 14+ * Changes:
 15+ *
 16+ */
 17+
 18+/*
 19+ * The wcsh algorithm uses a consistent hashing scheme that is similar to the
 20+ * one used in e.g. libketama and some DHTs: each destination address is
 21+ * hashed one or more times and placed on a circle (the continuum) which
 22+ * represents a 16 bit integer space. A new connection's source IP address
 23+ * is hashed, and the smallest destination hash greater than or equal to the
 24+ * source address hash is selected. If the selected server is unavailable
 25+ * (down, overloaded) then the next is tried, and so on.
 26+ *
 27+ * Destination load weighting is implemented by placing a number of points on
 28+ * the continuum that is a multiple of the destination weight value: the
 29+ * higher the weight, the more points on the continuum, and the higher the
 30+ * probability that this destination is selected if the source IP address
 31+ * and destination addresse hashes are uniformly distributed over the circle.
 32+ *
 33+ */
 34+
 35+#include <linux/module.h>
 36+#include <linux/kernel.h>
 37+#include <linux/ip.h>
 38+
 39+#include <net/ip_vs.h>
 40+
 41+/*
 42+ * Continuum size
 43+ */
 44+#ifndef CONFIG_IP_VS_WCSH_CONT_BITS
 45+#define CONFIG_IP_VS_WCSH_CONT_BITS 16
 46+#endif
 47+#define IP_VS_WCSH_CONT_BITS CONFIG_IP_VS_WCSH_CONT_BITS
 48+#define IP_VS_WCSH_CONT_SIZE (1 << IP_VS_WCSH_CONT_BITS)
 49+#define IP_VS_WCSH_CONT_MASK (IP_VS_WCSH_CONT_SIZE - 1)
 50+
 51+struct ip_vs_wcsh_dest_point {
 52+ struct list_head n_list; /* sorted linked list of points */
 53+ __u32 value; /* point value */
 54+ struct ip_vs_dest *dest; /* destination server */
 55+};
 56+
 57+struct ip_vs_wcsh_data {
 58+ struct list_head continuum; /* continuum linked list */
 59+ struct ip_vs_wcsh_dest_point *cntcache; /* pointer to allocated memory for continuum entries */
 60+};
 61+
 62+/*
 63+ * Returns a hash value for a host order IPv4 address
 64+ */
 65+static inline unsigned ip_vs_wcsh_hashkey(const unsigned addr)
 66+{
 67+ return addr * 2654435761UL;
 68+}
 69+
 70+static int ip_vs_wcsh_alloc_continuum(struct ip_vs_wcsh_data *sched_data, const struct ip_vs_service *svc)
 71+{
 72+ struct ip_vs_dest *dest;
 73+ int pointcount = 0;
 74+
 75+ /*
 76+ * Determine how many points the continuum will consist off,
 77+ * the sum of all destination weights
 78+ */
 79+ list_for_each_entry(dest, &svc->destinations, n_list) {
 80+ pointcount += atomic_read(&dest->weight);
 81+ }
 82+
 83+ /* Allocate memory for the continuum */
 84+ sched_data->cntcache = kmalloc(sizeof(struct ip_vs_wcsh_dest_point)*pointcount, GFP_ATOMIC);
 85+ if (sched_data->cntcache == NULL) {
 86+ IP_VS_ERR("ip_vs_wcsh_create_continuum(): no memory\n");
 87+ return -ENOMEM;
 88+ }
 89+ IP_VS_DBG(2, "[wcsh] Created continuum of %d points, "
 90+ "using %Zd bytes of memory\n", pointcount,
 91+ sizeof(struct ip_vs_wcsh_dest_point)*pointcount);
 92+
 93+ return 0;
 94+}
 95+
 96+/*
 97+ * Inserts a new continuum point @point at the right position such that
 98+ * an ascending order of hash values is maintained in the linked list @continuum
 99+ */
 100+static void ip_vs_wcsh_insert_sorted(struct ip_vs_wcsh_dest_point *new_point, struct list_head *head)
 101+{
 102+ struct ip_vs_wcsh_dest_point *point;
 103+ struct list_head *pos;
 104+
 105+ /* Is there no cleaner way to do this? */
 106+
 107+ if (list_empty(head) || new_point->value < list_first_entry(head, struct ip_vs_wcsh_dest_point, n_list)->value) {
 108+ list_add(&new_point->n_list, head);
 109+ }
 110+ else {
 111+ list_for_each(pos, head) {
 112+ point = list_entry(pos, struct ip_vs_wcsh_dest_point, n_list);
 113+ if (point->value <= new_point->value
 114+ && (list_is_last(pos, head)
 115+ || new_point->value < list_entry(pos->next, struct ip_vs_wcsh_dest_point, n_list)->value)) {
 116+ /* Insert after pos */
 117+ __list_add(&new_point->n_list, pos, pos->next);
 118+ return;
 119+ }
 120+ }
 121+ }
 122+}
 123+
 124+/*
 125+ * Create the continuum by hashing each server multiple times
 126+ * (determined by weight) onto the circle
 127+ */
 128+static void ip_vs_wcsh_create_continuum(struct ip_vs_wcsh_data *sched_data, const struct ip_vs_service *svc)
 129+{
 130+ struct ip_vs_wcsh_dest_point *point;
 131+ struct ip_vs_dest *dest;
 132+
 133+ if (svc->num_dests == 0)
 134+ return;
 135+
 136+ point = sched_data->cntcache;
 137+
 138+ list_for_each_entry(dest, &svc->destinations, n_list) {
 139+ unsigned hashkey;
 140+ int weight, i;
 141+
 142+ /* Hash this destination n times, where n = dest->weight */
 143+ weight = atomic_read(&dest->weight);
 144+ hashkey = ntohl(dest->addr);
 145+ for (i=0; i < weight; i++) {
 146+ hashkey = ip_vs_wcsh_hashkey(hashkey);
 147+
 148+ point->value = hashkey & IP_VS_WCSH_CONT_MASK;
 149+ atomic_inc(&dest->refcnt);
 150+ point->dest = dest;
 151+
 152+ IP_VS_DBG(6, "[wcsh] Continuum point %d created for server %u.%u.%u.%u:%d [%u]\n",
 153+ i, NIPQUAD(dest->addr), ntohs(dest->port), point->value);
 154+
 155+ ip_vs_wcsh_insert_sorted(point, &sched_data->continuum);
 156+ point++;
 157+ }
 158+ }
 159+}
 160+
 161+static void ip_vs_wcsh_flush_continuum(struct ip_vs_wcsh_data *sched_data, const struct ip_vs_service *svc)
 162+{
 163+ struct ip_vs_wcsh_dest_point *point;
 164+ struct list_head *p, *q;
 165+
 166+ if (list_empty(&sched_data->continuum))
 167+ return;
 168+
 169+ list_for_each_safe(p, q, &sched_data->continuum) {
 170+ point = list_entry(p, struct ip_vs_wcsh_dest_point, n_list);
 171+ atomic_dec(&point->dest->refcnt);
 172+ point->dest = NULL;
 173+ list_del(p);
 174+ }
 175+
 176+ kfree(sched_data->cntcache);
 177+ sched_data->cntcache = NULL;
 178+}
 179+
 180+static int ip_vs_wcsh_init_svc(struct ip_vs_service *svc)
 181+{
 182+ struct ip_vs_wcsh_data * sched_data;
 183+
 184+ sched_data = kmalloc(sizeof(struct ip_vs_wcsh_data), GFP_ATOMIC);
 185+ if (sched_data == NULL) {
 186+ IP_VS_ERR("ip_vs_wcsh_init_svc(): no memory\n");
 187+ return -ENOMEM;
 188+ }
 189+ INIT_LIST_HEAD(&sched_data->continuum);
 190+ svc->sched_data = sched_data;
 191+
 192+ if (svc->num_dests > 0) {
 193+ int res = ip_vs_wcsh_alloc_continuum(sched_data, svc);
 194+ if (res < 0)
 195+ return res;
 196+ ip_vs_wcsh_create_continuum(sched_data, svc);
 197+ }
 198+
 199+ IP_VS_DBG(6, "[wcsh] Scheduler initialized\n");
 200+ return 0;
 201+}
 202+
 203+static int ip_vs_wcsh_done_svc(struct ip_vs_service *svc)
 204+{
 205+ struct ip_vs_wcsh_data *sched_data = svc->sched_data;
 206+
 207+ ip_vs_wcsh_flush_continuum(sched_data, svc);
 208+ kfree(svc->sched_data);
 209+
 210+ IP_VS_DBG(6, "[wcsh] Scheduler cleanup done\n");
 211+ return 0;
 212+}
 213+
 214+static int ip_vs_wcsh_update_svc(struct ip_vs_service *svc)
 215+{
 216+ struct ip_vs_wcsh_data *sched_data = svc->sched_data;
 217+
 218+ ip_vs_wcsh_flush_continuum(sched_data, svc);
 219+ if (svc->num_dests > 0) {
 220+ int res = ip_vs_wcsh_alloc_continuum(sched_data, svc);
 221+ if (res < 0)
 222+ return res;
 223+ ip_vs_wcsh_create_continuum(sched_data, svc);
 224+ }
 225+
 226+ IP_VS_DBG(6, "[wcsh] Service updated\n");
 227+ return 0;
 228+}
 229+
 230+/*
 231+ * Source Hashing scheduling
 232+ */
 233+static inline int ip_vs_wcsh_is_feasible(const struct ip_vs_dest *dest) {
 234+ return dest
 235+ && (dest->flags & IP_VS_DEST_F_AVAILABLE)
 236+ && !(dest->flags & IP_VS_DEST_F_OVERLOAD);
 237+}
 238+
 239+static struct ip_vs_wcsh_dest_point *
 240+ip_vs_wcsh_get_nearest_point(const unsigned hashkey, const struct list_head *head)
 241+{
 242+ struct ip_vs_wcsh_dest_point *point;
 243+
 244+ if (unlikely(list_empty(head)))
 245+ return NULL;
 246+ else list_for_each_entry(point, head, n_list) {
 247+ IP_VS_DBG(6, "[wcsh] Evaluating destination %u.%u.%u.%u:%d [%u]\n",
 248+ NIPQUAD(point->dest->addr), ntohs(point->dest->port), point->value);
 249+ if (unlikely(hashkey <= point->value))
 250+ return point;
 251+ }
 252+
 253+ /* hashkey is bigger than all point values, so wrap around and return the lowest value */
 254+ return list_first_entry(head, struct ip_vs_wcsh_dest_point, n_list);
 255+}
 256+
 257+static struct ip_vs_wcsh_dest_point *
 258+ip_vs_wcsh_get_feasible_dest(struct ip_vs_wcsh_dest_point *start, const struct list_head *head, int (*feasible)(const struct ip_vs_dest*))
 259+{
 260+ struct ip_vs_wcsh_dest_point *point = start;
 261+
 262+ if (unlikely(list_empty(head)))
 263+ return NULL;
 264+ else while (1) {
 265+ list_for_each_entry_from(point, head, n_list) {
 266+ IP_VS_DBG(6, "[wcsh] Evaluating feasibility of server %u.%u.%u.%u:%d [%u]: %d\n",
 267+ NIPQUAD(point->dest->addr), ntohs(point->dest->port), point->value,
 268+ (*feasible)(point->dest));
 269+ if (likely((*feasible)(point->dest)))
 270+ return point;
 271+ else if (unlikely((point->n_list.next == &start->n_list))) /* FIXME: messy */
 272+ return NULL; /* No feasible servers available */
 273+ }
 274+ /* Start over from the beginning of the list */
 275+ point = list_first_entry(head, struct ip_vs_wcsh_dest_point, n_list);
 276+ }
 277+}
 278+
 279+static struct ip_vs_dest *
 280+ip_vs_wcsh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 281+{
 282+ struct ip_vs_wcsh_dest_point *point;
 283+ struct ip_vs_dest *dest;
 284+ struct iphdr *iph = ip_hdr(skb);
 285+ struct ip_vs_wcsh_data *sched_data = svc->sched_data;
 286+ unsigned hashkey;
 287+
 288+ hashkey = ip_vs_wcsh_hashkey(ntohl(iph->saddr)) & IP_VS_WCSH_CONT_MASK;
 289+
 290+ /* Find the nearest server point with a hash value bigger than or equal to hashkey */
 291+ point = ip_vs_wcsh_get_nearest_point(hashkey, &sched_data->continuum);
 292+ if (unlikely(point == NULL))
 293+ return NULL;
 294+
 295+ /* Find the nearest feasible destination */
 296+ point = ip_vs_wcsh_get_feasible_dest(point, &sched_data->continuum, ip_vs_wcsh_is_feasible);
 297+ if (unlikely(point == NULL))
 298+ return NULL;
 299+
 300+ dest = point->dest;
 301+ IP_VS_DBG(6, "[wcsh] Source IP address %u.%u.%u.%u [%u] "
 302+ "--> server %u.%u.%u.%u:%d [%u]\n",
 303+ NIPQUAD(iph->saddr), hashkey,
 304+ NIPQUAD(dest->addr), ntohs(dest->port), point->value);
 305+
 306+ return dest;
 307+}
 308+
 309+/*
 310+ * IPVS WCSH Scheduler structure
 311+ */
 312+static struct ip_vs_scheduler ip_vs_wcsh_scheduler =
 313+{
 314+ .name = "wcsh",
 315+ .refcnt = ATOMIC_INIT(0),
 316+ .module = THIS_MODULE,
 317+ .init_service = ip_vs_wcsh_init_svc,
 318+ .done_service = ip_vs_wcsh_done_svc,
 319+ .update_service = ip_vs_wcsh_update_svc,
 320+ .schedule = ip_vs_wcsh_schedule,
 321+};
 322+
 323+static int __init ip_vs_wcsh_init(void)
 324+{
 325+ INIT_LIST_HEAD(&ip_vs_wcsh_scheduler.n_list);
 326+ return register_ip_vs_scheduler(&ip_vs_wcsh_scheduler);
 327+}
 328+
 329+static void __exit ip_vs_wcsh_cleanup(void)
 330+{
 331+ unregister_ip_vs_scheduler(&ip_vs_wcsh_scheduler);
 332+}
 333+
 334+module_init(ip_vs_wcsh_init);
 335+module_exit(ip_vs_wcsh_cleanup);
 336+MODULE_LICENSE("GPL");

Status & tagging log