#!/usr/bin/perl -w
#
# Locate lojban text from a file containing some lines which
# are lojban and some which are not.
#
# (there are hacks in here to make it irc-smart)
#
use strict;
use warnings;
use Data::Dumper;

##############################################################################

use IPC::Open2;

# Main script body.
#
# Usage: <script> filename outfile
#
# Reads `filename` line by line.  Every line must look like one of:
#   * "HH:MM <nick> text"   - a normal message
#   * "HH:MM * nick text"   - an action
#   * anything containing "-----" - a log marker, copied to `outfile` verbatim
# Any other line aborts the run.
#
# For message/action lines the text is stripped of further log decoration,
# split into words and handed to vlatai.  The line (in its original,
# undecorated-by-us form) is written to `outfile` when a large enough
# percentage of its words are classified as lojban.

if (@ARGV != 2) {
  die "usage: $0 filename outfile";
}
my $filename = $ARGV[0];
my $outfile = $ARGV[1];

# Three-argument open with lexical handles: a file name starting or ending
# with mode characters ("<", ">", "|") can no longer change how it is opened.
open my $in, '<', $filename
  or die "open $filename: $!";
open my $out, '>', $outfile
  or die "open $outfile: $!";

LINE:
while (my $raw = <$in>) {
  my $text;

  # trim the irc-style formatting out of this
  if ($raw =~ /\d\d:\d\d \<\s*[^ ]+\> (.*)$/) {
    $text = $1;
  }
  elsif ($raw =~ /\d\d:\d\d\s*\*\s*[^ ]+ (.*)$/) {
    $text = $1;
  }
  elsif ($raw =~ /-----.*/) {
    # keep log thingies.
    print {$out} $raw;
    next LINE;
  }
  else {
    die "unkown line; $raw";
  }

  # Peel off any further decoration left at the start of the captured text:
  # bracketed or dated timestamps, action stars, <nick> / #channel> prefixes,
  # and the wrapper around a topic change (only the quoted topic is kept).
  $text =~ s/^[[]\d\d:\d\d[]]//;
  $text =~ s/^\d\d [A-Z][a-z][a-z] \d\d\d\d \d\d:\d\d:\d\d//;
  $text =~ s/^ \** //;
  $text =~ s/^ \<[^>]*\> //;
  $text =~ s/^\<[^>]*\> //;
  $text =~ s/^#[^>#]*\> //;
  $text =~ s/^[^ ]* has changed the topic to "([^"]*)"/$1/;

  ## print "line: $text\n";

  # Split into words and strip every "." from each of them.  Tokens that
  # consisted only of dots (and the empty leading field split produces when
  # the text starts with whitespace) end up empty; drop those so that no
  # blank lines are sent to vlatai.
  my @words = split(/\s+/, $text);
  s{[.]}{}g for @words;
  @words = grep { length } @words;

  ## print "words: ".join("\n", @words)."\n";

  # Decide this *before* starting vlatai, so that skipping a line never
  # leaves a child process or its pipes behind.
  next LINE unless @words;

  my @vlatai_says = classify_words(@words);

  ## print "says: ".Dumper(\@vlatai_says)."\n";

  my $good_count  = grep { m{: (cmavo\(s\)|lujvo|gismu) :} } @vlatai_says;
  my $cmene_count = grep { m{: cmene :} } @vlatai_says;
  my $total_count = scalar @vlatai_says;

  next LINE if $good_count <= 0;

  # Special cases for non-lojban quotes: when one of these words is present,
  # discount a fixed number of entries from the total (once per kind of
  # word, no matter how often it occurs on the line).
  my %discount_for = (
    "la'oi" => 1,
    "zo'oi" => 1,
    "zoi"   => 3,
    "la'o"  => 3,
  );
  for my $quoter (keys %discount_for) {
    if (grep { m{^\s*\Q$quoter\E\s*$} } @words) {
      $total_count -= $discount_for{$quoter};
    }
  }

  next LINE if $total_count <= 0;

  ## print "good_count: $good_count; cmene_count: $cmene_count; total_count: $total_count\n";

  # A cmene only counts as 0.4 of a normal lojban word.
  my $score = (($good_count + ($cmene_count * 0.4)) / $total_count) * 100;

  # percent of words on a line which must be lojban
  my $needed = 80;

  # Special cases for very short sentences.
  if ($total_count == 2) {
    # In a two-word sentence, this matches 1 normal lojban
    # word and 1 cmene.
    $needed = 60;
  }
  elsif ($total_count == 3) {
    # In a 3-word sentence, this is 2
    # normal lojban words and one unmatched or cmene.
    $needed = 66;
  }

  ## print "score: $score, needed: $needed\n";

  if ($score >= $needed) {
    ## print "PASSED! -- $text\n";
    print {$out} $raw;
  }
}
close $in;
# Buffered write errors only surface when the handle is closed.
close $out
  or die "close $outfile: $!";

# classify_words(@words)
#
# Runs /usr/bin/vlatai, writes the given words to its stdin (one per line),
# and returns the list of lines it printed on stdout.  The child is always
# signalled and reaped before returning, so no process outlives the call.
# open2 raises an exception itself if the program cannot be started.
sub classify_words {
  my @words = @_;

  my ($from_vlatai, $to_vlatai);
  my $pid = open2($from_vlatai, $to_vlatai, '/usr/bin/vlatai');

  print {$to_vlatai} join("\n", @words)."\n";
  # Closing our write end gives vlatai end-of-file on its stdin.
  close $to_vlatai;
  my @output = <$from_vlatai>;
  close $from_vlatai;

  # SIGHUP (signal 1) in case the child is still around, then reap it.
  kill 'HUP', $pid;
  waitpid($pid, 0);

  return @output;
}
