check_access_log.pl 10

Posted by Peter Burkholder Sat, 04 Dec 2010 02:57:00 GMT

#!/usr/bin/perl 

# This plugin processes a logfile ( -l ) in the httpd common access format and
# report on all status entries for last ( -m ) minutes
use strict;
use warnings;

use File::ReadBackwards;
use Date::Manip;
use Nagios::Plugin;
use vars qw($VERSION $PROGNAME  $verbose $warn $critical $timeout $result);
$VERSION = '1.0';

# get the base name of this script for use in the examples
use File::Basename;
$PROGNAME = basename($0);

sub parse {
# Lifted from
# http://cpansearch.perl.org/src/MSLAGLE/Parse-AccessLogEntry-0.06/AccessLogEntry.pm
        my $Line=shift;
        my $Ref;
        my $Rest;
        my $R2;
        ($Ref->{host},$Ref->{user},$Ref->{date},$Rest)= $Line =~ m,^([^\s]+)\s+-\s+([^ ]+)\s+\[(.*?)\]\s+(.*),;
        my @Dsplit=split(/\s+/,$Ref->{date});
        $Ref->{diffgmt}=$Dsplit[1];
        my @Ds2=split(/\:/,$Dsplit[0],2);
        $Ref->{date}=$Ds2[0];
        $Ref->{time}=$Ds2[1];
        if ($Rest) {
                ($Ref->{rtype},$Ref->{file},$Ref->{proto},$Ref->{code},$Ref->{bytes},$R2)=split(/\s/,$Rest,6);
# Removing following processing improves run by 10% - PDB
#        $Ref->{rtype}=~tr/\"//d;
#        $Ref->{proto}=~tr/\"//d;
#                if ($R2)
#                {
#                        my @Split=split(/\"/,$R2);
#                        $Ref->{refer}=$Split[1];
#                        $Ref->{agent}=$Split[3];
#                }
        }
        return $Ref;
}

my %regex_for = (
    '20X'   => '2\d\d',
    '30X'   => '3\d\d',
    '403'   => '403',
    '404'   => '404',
    '40X'   => '4\d[^3-4]',
    '500'   => '500',
    '503'   => '503',
    '50X'   => '50(1|2|4|5)',
    'no_status'  => '-',
    );

# use Nagios::Plugin::Getopt to process the @ARGV command line options:
#   --verbose, --help, --usage, --timeout and --host are defined automatically.
my $np = Nagios::Plugin->new( 
    shortname => "ACCESS_STATUS",  
    usage => "Usage: %s [ -v|--verbose ] -l|--logfile=file -m|--m=minutes " .
    "[ -c|--critical=<threshold>(20)  ] [ -w|--warning=<threshold>(10) ] ".
    "[ -a|--activity=number_of_lines (100) ]",
    blurb => "Report status summary for the last minutes of an httpd access log" 
);

# Parse arguments and process standard ones (e.g. usage, help, version)
$np->add_arg(
        spec => 'logfile|l=s',
        help => qq{-l, --logfile=STRING},
        required => 1,
);
$np->add_arg(
        spec => 'minutes|m=i',
        help => qq{-m, --minutes=INTEGER},
        default => 5,
);
$np->add_arg(
        spec => 'debug|D+',
        help => qq{-D, --debug},
);
$np->add_arg(
    spec => 'warning|w=i',
    help => qq{-w, --warning=FLOAT Warn when % failures is above the specified threshold},
    default => 10,
);
$np->add_arg(
    spec => 'critical|c=i',
    help => qq{-c, --critical=FLOAT Critical when % failures is above the specified threshold},
    default => 20,
);

$np->add_arg(
    spec => 'activity|a=i',
    help => qq{-a, --activity=INTEGER Only go to critical or warning if log activity exceeds threshold},
    default => 100,
);
$np->getopts;

my $threshold = $np->set_thresholds(
    warning     => $np->opts->warning,
    critical    => $np->opts->critical,
);
my $log_file= $np->opts->logfile or $np->nagios_die("No input logfile defined");
my $minutes = $np->opts->minutes;
my $DEBUG   = $np->opts->debug;
my $start   = DateCalc ("epoch ".time(),"$minutes minutes ago");
#my $start   = "2010092012:59:30"; # sample for multiple runs on same file

my %status_score;   # initialize score card
my $lines=0;
my $bytes_served=0;
my $matched='';

tie *BW, 'File::ReadBackwards', $log_file or 
    $np->nagios_die("can't read $log_file $!") ;

print STDERR "Opening $log_file\n" if $np->opts->verbose;

# Start looping backward thru logfile
my $prior_logdate="";    # Set prior lines' logdate to nothing
while (<BW>) {
    my $line_ref    = parse($_);
    # skip tests from load-balancers
    next if ($line_ref->{host} =~ m/^192\.168\.30\.1/);                                                               
    if ( ( $lines %1000 == 0 ) and $np->opts->verbose) {
        print STDERR "line: $lines\n";
        print $_;
    }
    $lines++;
    my $logdate     = $line_ref->{date} . " " . $line_ref->{time};
    if ( $logdate eq $prior_logdate || ParseDate($logdate) gt $start ) {
        $prior_logdate = $logdate;
        my $status = $line_ref->{code};
        print $_ if (($status eq '-' ) and  $np->opts->verbose);
        $matched='false';
        $bytes_served += $line_ref->{bytes} if ($line_ref->{bytes} ne '-');
        foreach my $status_key (keys %regex_for) {
            if ( $status =~ m/$regex_for{$status_key}/x ) {
                $status_score{$status_key}++;
                $matched='true';
                last;               # break out of status_key foreach
            }
        }
        $status_score{'other'}++ if ( $matched eq 'false' );
    } else {
        last;                       # break out of <BW>;
    }
}

my $failure_rate=
    sprintf("%0.2f", 100 - (($status_score{'20X'} + $status_score{'30X'})/$lines) * 100.0);

$np->add_perfdata(
    label   => 'Total',
    value   => $lines,
);

$np->add_perfdata(
    label   => 'FAIL',
    value   =>  $failure_rate,
    uom     => '%',
    threshold => $threshold,
);

$np->add_perfdata(
    label   => 'BytesServed',
    uom     => 'kB',
    value   => 
        int($bytes_served/1024),

);

foreach my $key (sort keys %regex_for) {
    $np->add_perfdata(
        label   => $key,
        value   => $status_score{$key}||0,
    );
}

$np->add_perfdata(
    label   => 'Other',
    value   => $status_score{'other'}||0,
);

# We will return OK for this check
# unless the number of lines exceeds the activity threshold
my $return_code = 0;  
if ( $lines > $np->opts->activity ) {
    $return_code = $np->check_threshold($failure_rate);
} 

#return_code => $return_code,
$np->nagios_exit( 
     return_code => $return_code,
     message => "FAILURE_RATE $failure_rate" 
);
<pre><code>

      

Graphing Access Log Status with PNP4Nagios, part two 16

Posted by Peter Burkholder Sat, 04 Dec 2010 02:10:00 GMT

The graphs I showed in part one are generated by performance output from a Perl script, check_access_log.pl (attached to this post). The performance data are then captured by pnp4nagios and saved into an RRD file. The default graphing templates provided with pnp4nagios v0.4 didn’t do justice to how interest these data are, so I wrote my own template (to be attached to a future post).

The usage for check_access_log.pl is:

Usage: check_log3.pl [ -v|--verbose ] -l|--logfile=file -m|--m=minutes [ -c|--critical=<threshold>(20) ] [ -w|--warning=<threshold>(10) ] [ -a|--activity=number_of_lines (100) ]

where:
  • logfile is the path to your httpd access log
  • minutes is how far back from the present you want to parse your log file (5 minutes by default)
  • -c|-w represent the percentage of non-200 statuses are considered problematic
  • --activity: only go to warning/critical if activity exceeds this number of lines per check. This prevent the check from going to warning/critical if you’ve taken it out of service.
The script requires:
  • File::ReadBackwards; # allows the script to read backwards until the time period criterion is met
  • Date::Manip;
  • Nagios::Plugin;

Generally, the live httpd access logs will not be on your Nagios server so you’ll need to invoke this over NRPE.

Graphing Access Log Status with PNP4Nagios, part one 18

Posted by Peter Burkholder Mon, 25 Oct 2010 01:20:00 GMT

When I started at $WORK we had little by way of a comprehensive monitoring system, and rectifying that quickly became my top priority. I rolled out Nagios, and needing a way to trend historical performance data, added "pnp4nagios"http://docs.pnp4nagios.org/pnp-0.4/start" on top of that. Over time I had a real desire to see what were the breakdown of 200s, 403s, 404s, 500s and other status codes being logged by our various web servers. What I wanted were graphs like this (oops, no traffic on the 20th),

 

or something that illustrated that it was okay when our backend Day Communique CQ5.3 publisher was running 80% error rates late in the afternoon (because our front-ends had cached all the 200 OK objects, but the 404s were still getting passed to the backend; in the morning, the publications flushed the front-end cache, and most 200 would go to the backend again):

 

Anyhow, setting up PNP4Nagios to generate these lovely, zoomable graphs will be the focus of my next post.

Bringing Devops to my Day Job 4

Posted by Peter Burkholder Mon, 25 Oct 2010 00:27:00 GMT

I’ve been following the "DevOps movement", since I was first turned on to Paul Hammond and John Allspaw’s "Velocity 2009 talk: 10+ Deploys Per Day: Dev and Ops Cooperation at Flickr":velocityconference.blip.tv/file/2284377/. Many of the precepts of their talk: infrastructure automation, automated deploys, version control, and shared metrics, have been part of my approach to system administration for some years now (albeit never to the extent that Flickr embraces these matters). So much of what they say, and what I see elsewhere in DevOps from following Chef and Puppet, resonates with an approach that I’m naturally tuned to. Not so much my workplace, despite nearly two years of quiet advocacy. Now, however, my workplace has come through the latest major release with more interest in evaluating how we do things than was the case during this last year’s headlong rush to a new architecture and infrastructure. So I scheduled a lunchtime session to view and discuss Hammond and Allspaw’s talk, and was pleased to have over two dozen people show up, and another handful call in to the conference line. Further, our senior VP watched the talk while he was on travel and responded that we should take seriously the direction in which Allspaw and Hammond point. I have no doubt the cultural hurdles will be harder to vault than the technological ones. But I have loved my work the best when I’ve been at the table with the developers from inception to deployment, and seen how well that works, so I’ll take heart from the positive response to this first small step. I’ll keep you posted.

Swept away at Surge 3

Posted by Peter Burkholder Tue, 05 Oct 2010 01:57:00 GMT

Last week I attended Surge up in Baltimore, and for the first time in over a decade I had the exhilarating sensation of being out of my depth. Since it was a small conference with some of the big players in web-scale applications and content delivery, there’s good reason for that, but it inspires me to pick up the pace in following some technologies I’ve only been tangentially aware of.

The most interesting talks were:
  • Scalable Design Patterns, Theo Schlossnagle
  • Enterprise solutions from commodity components, Brian Cantrill
  • Growth at Gilt Group, Geir Magnusson, Jr.
  • Why some architects almost never shard their applications, Baron Schwartz

I should find more to say about them, but the primary reason I haven’t been blogging much that last few months is that I try to take on too much writing too late in the evening, then run out of steam—so just take my recommendations for what they are.

All of the presentations are supposed to posted within a few weeks. I need to go back and watch a few that I missed. To wit, Artur Bergman on scaling Wikia, Justin Sheehy on Concurrency, Benjamin Black on Netflow (if time) and Neil Gunther on Quantifying Scalability.

So, you wanna be a consultant 5

Posted by Peter Burkholder Tue, 16 Mar 2010 01:28:00 GMT

On Wednesday, March 10, Sweth Chandramouli spoke on being an independent technology consultant at the DC-Sage bi-monthly meeting. While the whole talk was worthwhile, what stood out for me were his distilled principles on how to build and find business. While he claims that he’s borrowed freely (and largely from Malcolm Gladwell), I’ve not before seen these principles applied to consulting. I think they’re particularly useful since sysadmin types (myself included) can be shy about promoting themselves, but it’s easier to overcome shyness if you know you are applying ‘correct’ methodology.

Ask for the behavior you want.

Example: Suppose I’m asked what line of work I’m in. We all know the wrong answer is to say, “I do web engineering,” and leave it at that. It’s also wrong to answer, “I do web eningeering. Do you know anyone needing to scale out their web presence, or fix an inadequate web infrastructure?” because “know” is a passive state and asks nothing of your questioner. Contrast with, “I do web engineering. Say, can you take a moment and tell me who you know that makes a living off running a website, and may need help scaling it out, or fixing a broken infrastructure?”

Now I’ve asked for a concrete action, a behavior, which not only may elict an immediate response, but is more likely to be recalled later when it really matters (as when my listener is talking to her CTO a week later).

(On an related note, Sweth pointed out that asking for business cards is more effective, and cheaper, than distributing them unsolicited)

Reward the behavior (not the outcome) that you desire

Be prolific with feedback and thanks for behaviors that may benefit you, even if the outcome of any specific behavior does not pan out. Sweth’s example was to send thank yous for any referral, not just the ones the bring in business. We also discussed the impact of hand-written thank-yous in an age of email.

So, the next day I emailed Sweth to get his snail mail address so I could send him a thank you card. In his response, he noted that I’m more likely to get DC-SAGE speakers to show up for return gigs if I send the thank you upon acceptance (rewarding the behavior) instead waiting for the talk to go through (rewarding the outcome). At which point I sent a thank-you both to Sweth and to John Nicholson), the May speaker on Law and Technology.

Lead with the giving hand

Sweth’s third principle was to give something of value to potential clients, but not to give away any services themselves. His example were the free “Site Evaluations” that some companies do, but he suggested we find ways that are not such naked sales pitches.

For me, this was a wake-up call to return to a practice I’d made for some years of offering, nay, insisting on giving a ‘Job Talk’ when called in for a job interview. I started doing that after graduate school because I was used to teaching/research candidates coming on site and making presentations, and when I was on the job market (particularly after the Tech Bubble) I thought it was a way to stand out from the crowd. But I also liked giving job talks because it gave me something to do in preparation for the interview.

In retrospect, the practice served me doubly well because giving a talk on some new technology provided something of tangible value to my potential employer, and built a relationship where the job offer was the logical step in reciprocation. Score.

Apache redirects with RewriteMap and RewriteCond 21

Posted by Peter Burkholder Wed, 15 Jul 2009 01:13:00 GMT

At work we’re migrating several thousand research articles from Zope to another CMS. The CMS folks are taking care of moving the content, but when we’re done we’re going to institute a boatload of R=301 redirects from the old URLs to the new URLs.

RewriteMap is the accepted way with mod_rewrite of handling a lot of one-to-one mappings that don’t follow any particular pattern. What I figured out today was that I can use a rewrite map in a RewriteCond statement so I only do the redirect when there’s a match in the rewrite map lookup.

Here are some snippets from my httpd.conf to illustrate:

For testing, we’ll want logging:


RewriteLog /var/log/httpd/rewrite.log
RewriteLogLevel 2

Define the map we’re using. The content of the map is ‘old_uri new_uri’ with a space separating the two. Use .txt for testing, and the below we’ll convert to a DBM.


RewriteMap research_map txt:/etc/httpd/conf/research_map.txt 

Here, I got a hint from http://www.tunnell.org/blog_posts_view.php?blog_postid=3. Remember that the syntax for a RewriteCond is:

RewriteCond TestString ConditionPattern

TestString will be a map lookup of $1, where $1 is the match string of the following RewriteRule, expressed: ${research_map:$1}

For ConditionPattern, we will test if the TestString is lexically greater than ””, the empty string, which is what the map lookup returns when there’s no match. Expressed: >""


RewriteCond ${research_map:$1}  >""     # IE, if map result is greater than "" 

So if the URL starts with /research, then use the research_map value for the key $1 to redirect to new address


RewriteRule ^(/research/.*$) ${research_map:$1}      [R=301,L]

What we end up with is a few lines of configuration that quickly let me put in place 3342 new redirects. Here’s the whole stanza:



RewriteMap research_map txt:/etc/httpd/conf/research_map.txt 
RewriteCond ${research_map:$1}  >""     # IE, if map result is greater than "" 
RewriteRule ^(/research/.*$) ${research_map:$1}      [R=301,L]

# Same thing, but lookup with a trailing slash if there isn't one
RewriteCond ${research_map:$1/}  >"" 
RewriteRule ^(/research/.*[^/]$) ${research_map:$1/}      [R=301,L]

Lastly, converting the textfile to a dbm speeds up the lookup by at least an order of magnitude.

bash command timeout 20

Posted by Peter Burkholder Mon, 22 Jun 2009 18:51:00 GMT

At $WORK I’m needing to maintain a backup system wherein our backup server a) starts an SSH process to stop-and-dump our CMS service, then b) SCPs the dumpfile back to the backup servers for writing to tape. I’ve discovered that the stop-and-dump part of the process would hang for 24 hours* when the stop-and-dump perl script exited but the initiating OpenSSH sshd process would not exit, preventing the SCP process from going forward.

I’ve decided to put a command timeout on the SSH process, and here’s how it looks in bash:

# Inspired by:
# http://www.ultranetsolutions.com/BASH-terminate-command-after-timeout.html
cmd_timeout() {
   [ $# -eq 2 ] || die "cmd_timeout takes 2 arguments" 
   command=$1
   sleep_time=$2

   # run $command in background, sleep for our timeout then kill the process if it is running
   # $! has the pid of the backgrounded job
   $command &
   cmd_pid=$!

   # sleep for our timeout then kill the process if it is running
   ( sleep $sleep_time && kill $cmd_pid && echo "ERROR - killed $command due to timeout $sleep_time exceeded" ) &
   killer_pid=$!

   # 'wait' for cmd_pid to complete normally.  If it does before the timeout is reached, then
   # the status will be zero.  If the killer_pid terminates it, then it will have a non-zero 
   # exit status
   wait $cmd_pid &> /dev/null
   wait_status=$?

   if [ $wait_status -ne 0 ]; then 
      echo "WARNING - command, $command, unclean exit" 
   else
      # Normal exit, detach and clean up the useless killer_pid
      disown $killer_pid
      kill $killer_pid &> /dev/null
   fi

   return $wait_status
}

cmd_timeout "ssh myhost some_long-running_command" 
next_command
* but I ought to raise this on an openssh mailing list in case it’s a bug, but anyho…

Building a VirtualBox server lab on OsX with NAT and Internal Networking (intnet) 29

Posted by Peter Burkholder Wed, 27 May 2009 00:13:00 GMT

Chef and Puppet, Take 2

A few weeks ago I announced my intent to compare Puppet to Chef, then quickly got bogged down with, among other things, needing to get my Typo installation up to snuff.

I’m going to take another shot at this, with the initial goal of getting Puppet and Chef running on CentOS 4.7 inside VirtualBox on a OsX 10.4 system.

VirtualBox

VirtualBox is a semi-free virtual environment from Sun. The open-source version seems suitable for server testing, but there’s also a more fully-featured proprietary version. I’m choosing it because I have more space on my $WORK system then my personal system, but I don’t have a VmWare Fusion license. Further, my home virtual Linux systems are all Ubuntu, and $WORK is RHEL, so there’s more point to this exercise using a RHEL-derived system if I’m going to get some traction on my plan to get a configuration-management system rolling at my workplace.

Creating a new VirtualBox

First I’m going to get two instances of Damn Small Linux. In VirtualBox

  • click “New”
  • select “Linux”, “Debian”, “256 Mb Ram”, “34 Mb hard drive”
  • create
  • in the system description, select CD/DVD-ROM and connect to the .iso image
    • in VirtualBox, you ‘Add’ isos from various places in your filesystem to a list of available ISOs, so add dsl-4.4.10-initrd.iso
  • Press ‘Start’ then you get an info window on which key to use to de-associate your keyboard from the GuestOS.
  • DSL (DamnSmallLinux) boots astonishingly fast
  • The process of cloning a system is not worth it for a diskless system. Just make DamnSmall2 following the same steps.

Out of the box, this created two hosts connected to the world via NAT, but both apparently on the same IP address (10.0.2.15). So far, so good.

What I want is:

  • All VMs to have NAT access to the world, and SSH access from the host operating system via port-forwarding.
  • All VMs to communicate with each other over an internal-only network at 192.168.5.0

It seems that the VirtualBox tutorials out there really muddy up the water. It’s not that hard.

NAT and port-forward ssh access

  • VirtualBox configuration
    • Add network adapter 1 as NAT
    • Run the following script

# Number all hosts up from 1
host=DamnSmall1
port=2201

# let eth0/adapter1 be DHCP NAT
VBoxManage setextradata "$host" \
    "VBoxInternal/Devices/pcnet/0/LUN#0/Config/guestssh/Protocol" TCP 
VBoxManage setextradata "$host" \
    "VBoxInternal/Devices/pcnet/0/LUN#0/Config/guestssh/GuestPort" 22 
VBoxManage setextradata "$host" \
    "VBoxInternal/Devices/pcnet/0/LUN#0/Config/guestssh/HostPort" $port
  • Guest Host Configuration—turns out the IP doesn’t matter ‘cause we use port-forwarding to reach the host
    • Debian /etc/network/interfaces
iface eth0 inet dhcp

Internal Networking on Static IPs

  • VirtualBox Configuration
    • Connect Adapter 2 to ‘intnet’ internal network
  • Guest Host configuration: Set up eth1 static ip addresses internally, e.g.
    • Debian /etc/network/interfaces
iface eth0 inet static
    address 192.168.5.201
    netmask 255.255.255.0

Upgrading to Typo 5.3 (on Dreamhost) 50

Posted by Peter Burkholder Wed, 22 Apr 2009 02:20:00 GMT

Upgrading from Typo 4.0.3 to Typo 5.3 on Dreamhost Rail

I’ve been slacking lately on my blogging, which is a shame because I have much to write about. So much going on, no time to write about it. To help rectify that, I intend to start devoting Tuesday evenings to technology blogging, and to start things off, let’s upgrade this dumpy old Typo 4.0.3 to shiny new Typo 5.3.

I’m on Dreamhost, and they suck, but not bad enough for me to pick up and go elsewhere, so I guess that actually counts as good service. So here are one set of correct steps to upgrade a Typo 4.X installation to Typo 5.3. These are based on the Upgrading to Typo5.3 instructions, but differ at a few points for Dreamhost specifics, and to accomodate my own mistakes.

  • login to the Typo web interface and uninstall all your sidebars (see note below)
  • use mysqldump to dump the mysql database
    mysqldump -u typo -p -h mysql.typo.pburkholder.com typo > ~/backup/typo.mysql
  • use tar to copy the typo installation

tar -czf backup/typo.4.0.3.tgz ./typo.pburkholder.com/

  • install the ‘typo’ gem. I’ve been using ~/.rubygems as my local gem install, so export GEM_PATH=/usr/lib/ruby/gems:/home/pburkholder/.rubygems
    gem install -i ~/.rubygems typo
  • now do the install:

~/.rubygems/bin/typo install ~/typo.pburkholder.com

  • move some of my old files over (the upgrade instructions suggest copying over old themes, don’t)

mkdir ~/typo.pburkholder.com/public/files/
cp typo4.pburkholder.com/public/files/* typo.pburkholder.com/public/files/
cp typo4.pburkholder.com/config/database.yml typo.pburkholder.com/config/database.yml

  • run the database migration cd ~/typo.pburkholder.com
    rake RAILS_ENV=production db:migrate
    rake gems:refresh_specs
    (The last step may not be necessary, but worked for me)
  • your old theme won’t work. Period. Simplest solution I came up with was to symlink a 5.3 theme to my old theme name. E.g., I had ‘azure’ as my theme, and ‘scribbish’ came with 5.3, so:

cd ~/typo.pburkholder.com/themes; ln -s scribbish azure

  • start a single instance on port 3000 and see how things are on http://typo.pburkholder.com:3000

script/server -e production (Ctrl-C to kill)

  • set up Passenger. I had been running the blog using FastCGI back before Dreamhost had any real Rails support. Now they support Passenger, which is promising. I had been running Typo using FastCGI suppor and an .htaccess file that called dispatcher.fcgi, and it was a dog. So..
  • Go have fun!!

What if I skipped step 1?

I misread the upgrade instructions, and moved the sidebar code instead of ‘removing’ them from the web UI. My new Typo instance complained bitterly about missing sidebars. To remove the sidebars manually is just a matter of connecting the production MySQL database and seeing what’s in the sidebars, then removing them:


select * from sidebars\G
delete from sidebars where id>1;

Now I need to replace my old sidebar links, such as:

*My (out-of-date) resume *LinkedIn Profile