#!/usr/bin/perl -w
#
# Locate lojban text from a file containing some lines which
# are lojban and some which are not.
#
# (there are hacks in here to make it irc-smart)
#
# Usage: <this script> filename outfile
#
# Every line of "filename" must look like an IRC log line: either a
# message ("HH:MM <nick> text"), an action ("HH:MM * nick text") or a log
# separator containing "-----".  Any other line aborts the run.
# Separators are copied to "outfile" verbatim; message and action lines
# are copied only when enough of their words are classified as lojban by
# the external vlatai program.
#

use strict;
use warnings;

use Data::Dumper;
use IPC::Open2;

##############################################################################

# External word classifier; one word per input line, one verdict per
# output line.
my $VLATAI = '/usr/bin/vlatai';

# percent of words on a line which must be lojban
my $DEFAULT_NEEDED = 80;

# Special cases for very short sentences (word count => needed percent).
#   2 words: matches 1 normal lojban word and 1 cmene.
#   3 words: 2 normal lojban words and one unmatched or cmene.
my %NEEDED_FOR_COUNT = (
    2 => 60,
    3 => 66,
);

# A cmene counts as this fraction of a normal lojban word.
my $CMENE_WEIGHT = 0.4;

# Special cases for non-lojban quotes: when one of these words appears on
# a line, that many verdicts are removed from the total (presumably the
# quoted foreign text and its delimiters -- confirm against the grammar).
# The discount is applied once per line, however often the word occurs.
my %DISCOUNT_FOR_QUOTER = (
    "la'oi" => 1,
    "zo'oi" => 1,
    "zoi"   => 3,
    "la'o"  => 3,
);

##############################################################################

# Extract the spoken text from one raw IRC log line.
#
# Returns the text with IRC/log decoration trimmed off, or nothing
# (undef in scalar context) when the line is neither a message
# ("HH:MM <nick> text") nor an action ("HH:MM * nick text").
sub message_text {
    my ($raw_line) = @_;

    my $text;
    if ( $raw_line =~ /\d\d:\d\d \<\s*[^ ]+\> (.*)$/ ) {
        $text = $1;
    }
    elsif ( $raw_line =~ /\d\d:\d\d\s*\*\s*[^ ]+ (.*)$/ ) {
        $text = $1;
    }
    else {
        return;
    }

    # Trim further log decoration that may be nested inside the text.
    $text =~ s/^[[]\d\d:\d\d[]]//;                                # "[HH:MM]"
    $text =~ s/^\d\d [A-Z][a-z][a-z] \d\d\d\d \d\d:\d\d:\d\d//;   # "DD Mon YYYY HH:MM:SS"
    $text =~ s/^ \** //;                                          # " *** "
    $text =~ s/^ \<[^>]*\> //;                                    # " <nick> "
    $text =~ s/^\<[^>]*\> //;                                     # "<nick> "
    $text =~ s/^#[^>#]*\> //;                                     # "#channel> "

    # For topic changes keep only the quoted topic itself.
    $text =~ s/^[^ ]* has changed the topic to "([^"]*)"/$1/;
    chomp $text;

    return $text;
}

# Feed the given words to vlatai, one per line, and return its output
# lines.  The child process is always reaped before returning, so callers
# may bail out afterwards without leaking processes or pipes.
#
# Dies when vlatai cannot be started.
sub ask_vlatai {
    my (@words) = @_;

    my ( $from_vlatai, $to_vlatai );

    # open2 reports failure by throwing, never by returning false.
    my $pid = eval { open2( $from_vlatai, $to_vlatai, $VLATAI ) }
        or die "vlatai won't run: $@";

    print {$to_vlatai} join( "\n", @words ) . "\n";
    close $to_vlatai;

    my @verdicts = <$from_vlatai>;
    close $from_vlatai;

    # All output has been read; make sure the child is gone, then reap it.
    kill 'HUP', $pid;
    waitpid( $pid, 0 );

    return @verdicts;
}

# Decide whether a line is lojban enough to keep.
#
#   $words    - array ref of the words that were sent to vlatai
#   $verdicts - array ref of vlatai's output lines
#
# Returns 1 when the weighted share of lojban words reaches the needed
# percentage, 0 otherwise (including when nothing usable was found).
sub looks_lojban {
    my ( $words, $verdicts ) = @_;

    # NOTE(review): the cmavo label is matched as the literal "cmavo(s)";
    # confirm this against real vlatai output.
    my $good_count  = grep { m{: (?:cmavo\(s\)|lujvo|gismu) :} } @{$verdicts};
    my $cmene_count = grep { m{: cmene :} } @{$verdicts};
    my $total_count = scalar @{$verdicts};

    return 0 if $good_count <= 0;

    for my $quoter ( keys %DISCOUNT_FOR_QUOTER ) {
        if ( grep { m{^\s*\Q$quoter\E\s*$} } @{$words} ) {
            $total_count -= $DISCOUNT_FOR_QUOTER{$quoter};
        }
    }

    return 0 if $total_count <= 0;

    ## print "good_count: $good_count; cmene_count: $cmene_count; total_count: $total_count\n";

    my $score
        = ( ( $good_count + ( $cmene_count * $CMENE_WEIGHT ) ) / $total_count )
        * 100;

    my $needed
        = exists $NEEDED_FOR_COUNT{$total_count}
        ? $NEEDED_FOR_COUNT{$total_count}
        : $DEFAULT_NEEDED;

    ## print "score: $score, needed: $needed\n";

    return $score >= $needed ? 1 : 0;
}

##############################################################################

if ( @ARGV != 2 ) {
    die "usage: $0 filename outfile";
}

my ( $filename, $outfile ) = @ARGV;

# Three-argument open: the file names are never parsed for a mode.
open my $in_fh,  '<', $filename or die "open $filename: $!";
open my $out_fh, '>', $outfile  or die "open $outfile: $!";

while ( my $raw_line = <$in_fh> ) {
    my $text = message_text($raw_line);

    if ( !defined $text ) {
        if ( $raw_line =~ /-----.*/ ) {

            # keep log thingies.
            print {$out_fh} $raw_line;
            next;
        }
        die "unkown line; $raw_line";
    }

    ## print "line: $text\n";

    my @words = split /\s+/, $text;
    for my $word (@words) {
        $word =~ s{[.]}{}g;
    }

    ## print "words: ".join("\n", @words)."\n";

    # Nothing to classify: skip before any child process is started.
    next if !@words;

    my @vlatai_says = ask_vlatai(@words);

    ## print "says: ".Dumper(\@vlatai_says)."\n";

    if ( looks_lojban( \@words, \@vlatai_says ) ) {

        ## print "PASSED! -- $text\n";
        print {$out_fh} $raw_line;
    }
}

close $in_fh;

# Buffered write errors only surface at close time.
close $out_fh or die "close $outfile: $!";