r21391 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r21390‎ | r21391 | r21392 >
Date:20:13, 19 April 2007
Author:river
Status:old
Tags:
Comment:
slayerd: regulate user memory usage by killing processes exceeding memory limit
Modified paths:
  • /trunk/tools/slayerd (added) (history)
  • /trunk/tools/slayerd/Makefile (added) (history)
  • /trunk/tools/slayerd/slayerd.cc (added) (history)

Diff [purge]

Index: trunk/tools/slayerd/slayerd.cc
@@ -0,0 +1,421 @@
 2+/* Copyright (c) 2007 River Tarnell <river@attenuate.org>. */
 3+/*
 4+ * Permission is granted to anyone to use this software for any purpose,
 5+ * including commercial applications, and to alter it and redistribute it
 6+ * freely. This software is provided 'as-is', without any express or implied
 7+ * warranty.
 8+ */
 9+
 10+/* $Id$ */
 11+
 12+/*
 13+ * slayerd: monitor user activity and regulate users using too much RAM.
 14+ */
 15+
 #include <string>
 #include <iostream>
 #include <fstream>
 #include <sstream>
 #include <algorithm>
 #include <stdexcept>
 #include <vector>
 #include <map>
 #include <set>
 #include <cerrno>
 #include <cstdio>
 #include <cstring>

 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include <pwd.h>
 #include <signal.h>
 #include <syslog.h>

 #include <boost/filesystem/path.hpp>
 #include <boost/filesystem/operations.hpp>
 #include <boost/lexical_cast.hpp>
 #include <boost/format.hpp>
 39+namespace fs = boost::filesystem;
 40+
 41+namespace {
 42+ std::string PATH_PROC = "/proc";
 43+ std::string SENDMAIL = "/usr/lib/sendmail";
 44+}
 45+
 46+struct process {
 47+ process(fs::path const &pth);
 48+
 49+ pid_t _pid;
 50+ std::string _comm;
 51+ char _state;
 52+ pid_t _ppid;
 53+ pid_t _pgrp;
 54+ pid_t _sid;
 55+ int _tty;
 56+ pid_t _tpgid;
 57+ unsigned long _flags;
 58+ unsigned long _minflt;
 59+ unsigned long _cminflt;
 60+ unsigned long _majflt;
 61+ unsigned long _cmajflt;
 62+ unsigned long _utime;
 63+ unsigned long _stime;
 64+ long _cutime;
 65+ long _cstime;
 66+ long _priority;
 67+ long _itrealvalue;
 68+ long _starttime;
 69+ unsigned long _vsize;
 70+ long _rss;
 71+ unsigned long _rlim;
 72+ unsigned long _startcode;
 73+ unsigned long _endcode;
 74+ unsigned long _stackstart;
 75+ unsigned long _kstkesp;
 76+ unsigned long _kstkeip;
 77+ unsigned long _signal;
 78+ unsigned long _blocked;
 79+ unsigned long _sigignore;
 80+ unsigned long _sigcatch;
 81+ unsigned long _wchan;
 82+ unsigned long _nswap;
 83+ unsigned long _cnswap;
 84+ int _exit_signal;
 85+ int _processor;
 86+ unsigned long _rt_priority;
 87+ unsigned long _policy;
 88+ long _nice;
 89+ uid_t _uid;
 90+
 91+ void _read_proc_data(fs::path const &);
 92+};
 93+
 94+process::process(fs::path const &pth)
 95+ : _pid(boost::lexical_cast<pid_t>(pth.leaf()))
 96+{
 97+ struct stat st;
 98+ if (::stat(pth.native_directory_string().c_str(), &st) == -1)
 99+ throw std::runtime_error("could not stat proc dir");
 100+ _uid = st.st_uid;
 101+
 102+ _read_proc_data(pth);
 103+}
 104+
 105+void
 106+process::_read_proc_data(fs::path const &pth)
 107+{
 108+ std::ifstream f((pth / "stat").native_file_string().c_str());
 109+ std::string sline;
 110+
 111+ if (!f)
 112+ throw std::runtime_error("could not read line from stat");
 113+
 114+ long dummy;
 115+ if (!(f >> _pid >> _comm >> _state >> _ppid >> _pgrp >> _sid >> _tty >> _tpgid
 116+ >> _flags >> _minflt >> _cminflt >> _majflt >> _cmajflt >> _utime
 117+ >> _stime >> _cutime >> _cstime >> _priority >> _nice >> dummy >> _itrealvalue
 118+ >> _starttime >> _vsize >> _rss >> _rlim >> _startcode >> _endcode
 119+ >> _stackstart >> _kstkesp >> _kstkeip >> _signal >> _blocked >> _sigignore
 120+ >> _sigcatch >> _wchan >> _nswap >> _cnswap >> _exit_signal >> _processor
 121+ >> _rt_priority >> _policy
 122+ ))
 123+ throw std::runtime_error("could not parse stat line");
 124+}
 125+
 126+std::string
 127+username(uid_t uid)
 128+{
 129+ struct passwd *p;
 130+ if ((p = getpwuid(uid)) == 0)
 131+ return boost::lexical_cast<std::string>(uid);
 132+ return std::string(p->pw_name);
 133+}
 134+
 135+uid_t
 136+uid(std::string const &username)
 137+{
 138+ struct passwd *p;
 139+ if ((p = getpwnam(username.c_str())) == 0)
 140+ return -1;
 141+ return p->pw_uid;
 142+}
 143+
 144+template<typename C>
 145+struct directory_enumerator {
 146+ C &list;
 147+
 148+ directory_enumerator(C &list) : list(list) {}
 149+
 150+ void operator() (fs::path const &pth) const {
 151+ /*
 152+ * Ensure it is actually a pid.
 153+ */
 154+ try {
 155+ boost::lexical_cast<pid_t>(pth.leaf());
 156+ } catch (boost::bad_lexical_cast const &) {
 157+ return;
 158+ }
 159+
 160+ try {
 161+ list.push_back(process(pth));
 162+ } catch (...) {}
 163+ }
 164+};
 165+
 166+template<typename C>
 167+directory_enumerator<C>
 168+enumerate_directory(C &list) {
 169+ return directory_enumerator<C>(list);
 170+}
 171+
 172+struct user {
 173+ user() : uid(-1), rss(0) {}
 174+
 175+ uid_t uid;
 176+ unsigned long rss;
 177+ std::vector<process> processes;
 178+};
 179+
 180+/*
 181+ * A sort comparator that uses a particular struct field.
 182+ */
 183+template<typename S, typename T, T (S::*F)>
 184+bool
 185+field_comparator(S const &a, S const &b)
 186+{
 187+ return b.*F < a.*F;
 188+}
 189+
 190+void
 191+version(void) {
 192+ std::cerr << "slayerd $Revision$\n";
 193+ std::cerr << "Copyright (C) 2007, River Tarnell <river@attenuate.org>.\n";
 194+}
 195+
 196+void
 197+usage(void) {
 198+ std::cerr <<
 199+"usage: slayerd [-vh] -l <limit> -t <thread> [-e <user>]\n"
 200+;
 201+}
 202+
 203+void
 204+log(std::string const &m)
 205+{
 206+ syslog(LOG_NOTICE, "%s", m.c_str());
 207+}
 208+
 209+void
 210+sendmail(std::string const &username, std::string const &message)
 211+{
 212+ std::string cmd = str(boost::format("%s -oi -bm -- %s") % SENDMAIL % username);
 213+ FILE *p = popen(cmd.c_str(), "w");
 214+ if (p == 0) {
 215+ log(str(boost::format("cannot send mail using %s: %s") % SENDMAIL % std::strerror(errno)));
 216+ return;
 217+ }
 218+
 219+ fwrite(message.data(), message.size(), 1, p);
 220+ pclose(p);
 221+}
 222+
 223+int
 224+main(int argc, char **argv)
 225+{
 226+ int delay = 10, pagesize = sysconf(_SC_PAGE_SIZE);
 227+ std::size_t limit = 0, thresh = 0;
 228+ int c;
 229+ std::set<uid_t> exempt;
 230+
 231+ char nodename[255];
 232+ gethostname(nodename, sizeof nodename);
 233+
 234+ while ((c = getopt(argc, argv, "l:t:e:vh")) != -1) {
 235+ switch (c) {
 236+ case 'l':
 237+ try {
 238+ limit = boost::lexical_cast<std::size_t>(optarg) * 1024 * 1024;
 239+ } catch (boost::bad_lexical_cast &) {
 240+ std::cerr << boost::format("\"%s\" is not a valid number\n") % optarg;
 241+ return 1;
 242+ }
 243+ break;
 244+
 245+ case 't':
 246+ try {
 247+ thresh = boost::lexical_cast<std::size_t>(optarg) * 1024 * 1024;
 248+ } catch (boost::bad_lexical_cast &) {
 249+ std::cerr << boost::format("\"%s\" is not a valid number\n") % optarg;
 250+ return 1;
 251+ }
 252+ break;
 253+
 254+ case 'd':
 255+ try {
 256+ delay = boost::lexical_cast<std::size_t>(optarg);
 257+ } catch (boost::bad_lexical_cast &) {
 258+ std::cerr << boost::format("\"%s\" is not a valid number\n") % optarg;
 259+ return 1;
 260+ }
 261+ break;
 262+
 263+ case 'e':
 264+ uid_t u;
 265+ if ((u = uid(optarg)) == -1) {
 266+ std::cerr << boost::format("user \"%s\" does not exist\n") % optarg;
 267+ return 1;
 268+ }
 269+ exempt.insert(u);
 270+ break;
 271+
 272+ case 'v':
 273+ version();
 274+ return 0;
 275+
 276+ case 'h':
 277+ version();
 278+ usage();
 279+ return 0;
 280+
 281+ default:
 282+ version();
 283+ usage();
 284+ return 1;
 285+ }
 286+ }
 287+ argc -= optind;
 288+ argv += optind;
 289+
 290+ if (limit == 0 || thresh == 0) {
 291+ usage();
 292+ return 1;
 293+ }
 294+
 295+ openlog("slayerd", LOG_PID, LOG_DAEMON);
 296+
 297+ if (daemon(0, 0) == -1) {
 298+ std::cerr << boost::format("cannot daemonise: %s\n") % std::strerror(errno);
 299+ return 1;
 300+ }
 301+
 302+ if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 303+ log(str(boost::format("warning: cannot lock memory: %s\n") % std::strerror(errno)));
 304+
 305+ log(str(boost::format("delay: %d, limit: %dM, threshold: %dM\n")
 306+ % delay % (limit / 1024 / 1024) % (thresh / 1024 / 1024)));
 307+
 308+ for (;;) {
 309+ fs::path proc(PATH_PROC);
 310+ std::vector<process> processes;
 311+
 312+ std::for_each(fs::directory_iterator(proc), fs::directory_iterator(),
 313+ enumerate_directory(processes));
 314+
 315+ /*
 316+ * Aggregate the processes by user.
 317+ */
 318+ std::vector<user> users;
 319+ for (std::size_t i = 0, end = processes.size(); i < end; ++i) {
 320+ process &p = processes[i];
 321+ user *u = 0;
 322+
 323+ for (std::size_t ui = 0, uend = users.size(); ui != uend; ++ui)
 324+ if (users[ui].uid == p._uid) {
 325+ u = &users[ui];
 326+ break;
 327+ }
 328+
 329+ if (u == 0) {
 330+ std::size_t n = users.size();
 331+ users.resize(n + 1);
 332+ users[n].uid = p._uid;
 333+ u = &users[n];
 334+ }
 335+
 336+ u->rss += p._rss;
 337+ u->processes.push_back(p);
 338+ }
 339+
 340+ /*
 341+ * Sort user by RSS.
 342+ */
 343+ std::sort(users.begin(), users.end(), field_comparator<user, unsigned long, &user::rss>);
 344+
 345+ for (std::size_t i = 0, end = users.size(); i < end; ++i) {
 346+ user &u = users[i];
 347+ std::size_t bytes = u.rss * pagesize;
 348+
 349+ if (exempt.find(u.uid) != exempt.end())
 350+ continue;
 351+
 352+ if (bytes < limit)
 353+ continue;
 354+
 355+ std::string uname = username(u.uid);
 356+ std::string message = str(boost::format(
 357+"From: slayerd <slayerd@%1%>\n"
 358+"To: %2% <%2%@%1%>\n"
 359+"Subject: Excessive memory usage from your processes.\n"
 360+"Reply-To: Wikimedia Toolserver Administrators <ts-admins@wikimedia.org>\n"
 361+"X-Mailer: slayerd $Revision$\n"
 362+"\n"
 363+"This message was automatically generated by slayerd on %1%.\n"
 364+"\n"
 365+"Hello,\n"
 366+"\n"
 367+"One or more of your processes on the host %1%\n"
 368+"were exceeding the configured memory limit, which is %3% megabytes.\n"
 369+"I have killed enough of your processes to bring your usage back to the\n"
 370+"threshold limit, which is %4% megabytes.\n"
 371+"\n"
 372+"These are the processes I killed:\n"
 373+"\n"
 374+ ) % nodename % uname % (limit / 1024 / 1024) % (thresh / 1024 / 1024));
 375+
 376+ log(str(boost::format("user \"%s\" is using %dM, over configured limit %dM")
 377+ % uname
 378+ % (bytes / 1024 / 1024)
 379+ % (limit / 1024 / 1024)));
 380+
 381+ std::sort(u.processes.begin(), u.processes.end(), field_comparator<process, long, &process::_rss>);
 382+
 383+ while (bytes >= thresh && !u.processes.empty()) {
 384+ process &p = u.processes[0];
 385+ std::string comm = p._comm.substr(1);
 386+ comm.resize(comm.size() - 1);
 387+
 388+ kill(p._pid, SIGKILL);
 389+
 390+ log(str(boost::format(" killed process \"%s\" (pid %d) using %dM, usage now %dM")
 391+ % comm % p._pid
 392+ % (p._rss * pagesize / 1024 / 1024)
 393+ % ((bytes - p._rss * pagesize) / 1024 / 1024)));
 394+
 395+ message += str(boost::format(" %s (pid %d), using %d megabyte(s)\n")
 396+ % comm % p._pid % (p._rss * pagesize / 1024 / 1024));
 397+
 398+ bytes -= p._rss * pagesize;
 399+ u.processes.erase(u.processes.begin());
 400+ }
 401+
 402+ log(str(boost::format(" usage is now within acceptable limits (%dM)")
 403+ % (bytes / 1024 / 1024)));
 404+
 405+ message += str(boost::format(
 406+"\n"
 407+"Your total memory usage is now %d megabyte(s).\n"
 408+"\n"
 409+"Excessive memory usage is usually a symptom of a broken program. Please\n"
 410+"investigate the cause of the problem and fix it before you restart these\n"
 411+"processes.\n"
 412+"\n"
 413+"Regards,\n"
 414+" slayerd (the process slayer)\n"
 415+ ) % (bytes / 1024 / 1024));
 416+
 417+ sendmail(uname, message);
 418+ }
 419+
 420+ sleep(delay);
 421+ }
 422+}
Index: trunk/tools/slayerd/Makefile
@@ -0,0 +1,16 @@
 2+CXX = g++
 3+CXXFLAGS = -O2 -g3 -ggdb
 4+LDFLAGS =
 5+SRCS = slayerd.cc
 6+OBJS = $(SRCS:.cc=.o)
 7+
 8+slayerd: $(OBJS)
 9+ $(CXX) $(CXXFLAGS) $(LDFLAGS) $^ -o $@ -lboost_filesystem
 10+
 11+install: slayerd
 12+ install -c -s -o root -g root -m 755 slayerd /usr/local/sbin
 13+
 14+.cc.o:
 15+ $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $<
 16+
 17+.SUFFICES: .cc .o