#!/usr/bin/env perl
#
# Naive Bayes "work" / "notwork" classifier.
#
# Usage: <script> FILE
#
# Tokenizes the training files under training_set/, tokenizes FILE, and
# prints the normalised likelihood that FILE belongs to each class.

use strict;
use warnings;

use Data::Dumper;

# Run the classifier only when executed as a script, so that the helper
# subs below can be loaded (e.g. via require) without side effects.
main(@ARGV) unless caller;

# main(@args)
#
# Entry point. $args[0] is the path of the file to classify.
# Dies if no path is given or if any training/test file cannot be read.
# Prints two lines with the likelihood of each class as a percentage.
sub main {
    my @args = @_;

    my $test_file = $args[0];
    die "usage: $0 <file-to-classify>\n" unless defined $test_file;

    my @work_files    = qw/work1 work2 work3 work4/;
    my @notwork_files = qw/notwork1 notwork2 notwork3 notwork4 notwork5 notwork6/;

    # token => number of training files of that class containing the token
    my %work_tokens    = _tokens_in_training_files(@work_files);
    my %notwork_tokens = _tokens_in_training_files(@notwork_files);

    # Every token seen anywhere in the training data.
    my %known_tokens = combine_hash(\%work_tokens, \%notwork_tokens);

    my %test_tokens = tokenize_file($test_file);

    my $work_file_count    = scalar(@work_files);
    my $notwork_file_count = scalar(@notwork_files);
    my $file_count         = $work_file_count + $notwork_file_count;

    # Class priors: the share of training files belonging to each class.
    my $probability_work    = $work_file_count / $file_count;
    my $probability_notwork = $notwork_file_count / $file_count;

    my $test_token_count = scalar(keys(%test_tokens));

    # Accumulate log-probabilities instead of multiplying raw
    # probabilities: a long input would otherwise underflow both products
    # to 0 and make the final normalisation divide by zero.
    my $log_work    = 0;
    my $log_notwork = 0;

    foreach my $token (keys(%test_tokens)) {
        # Tokens never seen in training carry no information; skip them.
        next unless exists($known_tokens{$token});

        # Add-one smoothing in the numerator. The denominator adds the
        # number of distinct tokens in the test file.
        # NOTE(review): that denominator is unusual for Laplace smoothing;
        # it is kept as in the original formula -- confirm it is intended.
        my $p_t_w = (($work_tokens{$token} || 0) + 1)
                  / ($work_file_count + $test_token_count);
        $log_work += log($p_t_w);

        my $p_t_nw = (($notwork_tokens{$token} || 0) + 1)
                   / ($notwork_file_count + $test_token_count);
        $log_notwork += log($p_t_nw);
    }

    # Rescale by the larger log so at least one exponent is exactly 1.
    # The common factor cancels when the two scores are normalised below.
    my $max_log = $log_work > $log_notwork ? $log_work : $log_notwork;

    # The evidence term also cancels in the normalisation; fall back to 1
    # for an empty test file so bayes() never divides by zero.
    my $evidence = $test_token_count || 1;

    my $score_work = bayes($probability_work, $evidence,
                           exp($log_work - $max_log));
    my $score_notwork = bayes($probability_notwork, $evidence,
                              exp($log_notwork - $max_log));

    my $score_sum          = $score_work + $score_notwork;
    my $likelihood_work    = $score_work / $score_sum;
    my $likelihood_notwork = $score_notwork / $score_sum;

    printf("likelihood of work email: %0.2f %%\n", ($likelihood_work * 100));
    printf("likelihood of notwork email: %0.2f %%\n", ($likelihood_notwork * 100));

    return;
}

# _tokens_in_training_files(@names)
#
# Tokenizes training_set/<name> for each name and merges the results.
# Returns a hash (as a list) mapping token => number of the given files
# that contain it. Dies if a file cannot be read.
sub _tokens_in_training_files {
    my @names = @_;

    my %counts = ();
    foreach my $name (@names) {
        my %tokens = tokenize_file("training_set/" . $name);
        %counts = combine_hash(\%counts, \%tokens);
    }

    return %counts;
}

# bayes($p_w, $p_t, $p_t_w)
#
# Returns ($p_t_w * $p_w) / $p_t. Dies (division by zero) if $p_t is 0.
sub bayes {
    my ($p_w, $p_t, $p_t_w) = @_;

    my $p_w_t = ($p_t_w * $p_w) / $p_t;

    return $p_w_t;
}

# combine_hash($hash1, $hash2)
#
# Takes two hash references with numeric values and returns a new hash
# (as a list) holding every key from both; values of keys present in both
# are summed. Neither input is modified.
sub combine_hash {
    my ($hash1, $hash2) = @_;

    my %resulthash = %{ $hash1 };
    foreach my $key (keys(%{ $hash2 })) {
        $resulthash{$key} += $hash2->{$key};
    }

    return %resulthash;
}

# tokenize($contents)
#
# Splits $contents on whitespace and returns a hash (as a list) mapping
# each distinct token to 1. Undefined or empty input yields an empty hash.
sub tokenize {
    my $contents = shift;
    $contents = '' unless defined $contents;

    # split ' ' discards leading whitespace, so text that starts with
    # blanks does not produce a spurious empty-string token.
    my %tokens = map { $_ => 1 } split(' ', $contents);

    return %tokens;
}

# tokenize_file($filename)
#
# Reads the whole file and returns tokenize() of its contents.
# Dies if no filename is given or the file cannot be opened.
sub tokenize_file {
    my $filename = shift;
    die "tokenize_file: no filename given\n" unless defined $filename;

    open(my $fh, '<', $filename)
        or die "cannot open '$filename': $!\n";
    my $contents = do { local $/; <$fh> };    # slurp the entire file
    close($fh);

    return tokenize($contents);
}