package PubMed::Bada;

use strict;
use warnings;

use Carp qw(confess);
use Data::Dumper;
use File::Copy;

use PubMed::Common;
use PubMed::Dates;
use PubMed::Search;
use PubMed::Paths;

## Directory that holds the bada files. The subs in this package read
## and write "$bada_dir/<pemi>.raw" and "$bada_dir/<pemi>.sou" in it.
my $bada_dir=$PubMed::Paths::bada_dir;

## Find what is recorded for a pmid in the ".sou" bada file it belongs to.
##
## Arguments:
##   $pmid - the pmid to look for; confess()es when it is missing
##   $test - optional flag, accepted so existing callers keep working;
##           nothing in this sub reads it
##
## Returns the text that follows the pmid (and the whitespace after it)
## on the matching line, or '' when neither PubMed::Search::search nor
## the grep fallback finds the pmid. confess()es when a result comes
## back that does not start with the pmid.
sub search_in_bada {
  my $pmid=shift // confess "I need a pmid here to search for.";
  my $test=shift // '';
  my $pemi=PubMed::Common::pmid_to_pemi($pmid);
  my $bada_file="$bada_dir/$pemi.sou";
  my $pmid_date=PubMed::Search::search($bada_file,$pmid);
  if(not $pmid_date) {
    ## search with a grep
    $pmid_date='';
    ## Run grep without a shell, so that neither the pmid nor the file
    ## name is ever interpreted as shell syntax. The configured command
    ## is split on whitespace, which is what the shell did with it.
    my @grep_command=split(' ',$PubMed::Paths::grep);
    if(open(my $grep_fh, '-|', @grep_command, "^$pmid ", $bada_file)) {
      ## only the first hit is of interest; taking more would leave
      ## newlines inside the returned value
      $pmid_date=<$grep_fh> // '';
      chomp $pmid_date;
      ## grep exits non-zero when nothing matches, that is not an error
      close $grep_fh;
    }
    else {
      warn "I can't run $PubMed::Paths::grep: $!\n";
    }
    if($pmid_date) {
      print "I found the pmid $pmid in $bada_file only after a grep.\n";
    }
  }
  if(not $pmid_date) {
    return '';
  }
  ## Strip the pmid from the start of the line only, and as a literal.
  ## An unanchored match would also cut the pmid out of the tail of
  ## a longer pmid and return garbage instead of failing.
  $pmid_date=~s|^\s*\Q$pmid\E\s+|| or confess "I have a bad search result.";
  return $pmid_date;
}

## Append "pmid date" lines to the ".raw" bada files, reading the
## date-named files found in a directory.
##
## Arguments:
##   $in_dir - directory to scan; confess()es when it is missing.
##             Sub-directories named with four digits are descended
##             into; files named YYYY-MM-DD are read; anything else is
##             skipped.
##   $to_do  - selects the files to read, default 0:
##               a hash ref - only files whose full path is a defined
##                            key in the hash are read
##               a number   - files are skipped when they were both
##                            modified more than that many days ago and
##                            PubMed::Dates::diff_dates reports more
##                            than that many between their date and
##                            today (presumably counted in days)
##               0          - every file is read
##   $test   - when true (the default), progress messages are printed
##
## Every line of a file that starts with digits contributes one line
## "<pmid> <date>" to "$bada_dir/<pemi>.raw", where the pemi comes from
## PubMed::Common::pmid_to_pemi and the date is the name of the file.
## Dies when a file can not be opened or written.
sub build_raw {
  my $in_dir=shift // confess "I need an in_dir here.";
  my $to_do=shift // 0;
  my $test=shift // 1;
  my $today;
  my $do_days;
  ## do a certain number of days?
  if(not ref($to_do)) {
    $today=PubMed::Dates::today();
    $do_days=$to_do;
  }
  ## List the directory ourselves rather than through a shell "ls".
  ## Hidden entries are left out and the names are sorted, as ls did.
  my $in_dh;
  if(not opendir($in_dh, $in_dir)) {
    warn "I can't read the directory $in_dir: $!\n";
    return;
  }
  my @dain_files=grep { not m|^\.| } readdir($in_dh);
  closedir($in_dh);
  foreach my $dain_file (sort @dain_files) {
    my $dain_fufi="$in_dir/$dain_file";
    ## recurse if this is a year
    if($dain_file=~m|^\d{4}$| and -d $dain_fufi) {
      build_raw($dain_fufi,$to_do,$test);
      next;
    }
    if($test) {
      print "dain_file is $dain_file\n";
    }
    if(not $dain_file=~m|^\d{4}-\d{2}-\d{2}$|) {
      if($test) {
        print "I am skipping non-dain file $dain_file.\n";
      }
      next;
    }
    ## it's really the same as the file, but ok
    my $date=$dain_file;
    if(ref($to_do)) {
      ## a hash of files to do
      if(not defined($to_do->{$dain_fufi})) {
        if($test) {
          print "I don't reindex $dain_fufi\n";
        }
        ## skip the file whether or not we are chatty about it
        next;
      }
      print "I index $dain_fufi\n";
    }
    elsif($to_do) {
      ## to_do is a number of days: check the file is not for an old date
      if((-M $dain_fufi) > $do_days
         and PubMed::Dates::diff_dates($date,$today) > $do_days) {
        if($test) {
          print "I skip $date, it is more than $do_days days ago.\n";
        }
        next;
      }
    }
    my $dain_fh;
    open($dain_fh, '<:encoding(UTF-8)', $dain_fufi) or die "I can't open $dain_fufi.";
    ## Collect the output per pemi first, so that every raw file is
    ## opened once per input file rather than once per input line.
    my %lines_for;
    while(my $line=<$dain_fh>) {
      chomp $line;
      if(not $line=~m|^(\d+)|) {
        if($test) {
          print "I skip line '$line'\n";
        }
        next;
      }
      my $pmid=$1;
      my $pemi=PubMed::Common::pmid_to_pemi($pmid);
      $lines_for{$pemi}.="$pmid $date\n";
    }
    close $dain_fh;
    foreach my $pemi (sort keys %lines_for) {
      my $bada_fufi="$bada_dir/$pemi.raw";
      my $bada_fh;
      open($bada_fh, '>>:encoding(UTF-8)', $bada_fufi) or die "I can't open $bada_fufi.";
      print $bada_fh $lines_for{$pemi};
      ## buffered write errors only show up when the file is closed
      close $bada_fh or die "I can't write $bada_fufi: $!";
    }
  }
}

## Rebuild the sorted ".sou" file of every ".raw" file in $bada_dir.
##
## Arguments:
##   $test - when true, progress messages are printed; default ''
##
## A .sou file is left alone when it exists, is not empty and was
## modified more recently than its .raw file. Otherwise the lines of
## the .raw file are sorted bytewise (LC_ALL=C), exact duplicates are
## dropped, and the first line of each pmid is written. Lines that do
## not look like "<digits> YYYY-MM-DD" are left out.
## Dies when sort can not be run or fails, or when the .sou file can
## not be written.
sub sou_from_bara {
  my $test=shift // '';
  foreach my $bara_file (glob("$bada_dir/*.raw")) {
    my $sou_file=$bara_file;
    $sou_file=~s|\.raw$|.sou| or die;
    if(-f $sou_file and ((not -z $sou_file) and (-M $sou_file < -M $bara_file))) {
      if($test) {
        print "I don't need to renew $sou_file\n";
      }
      next;
    }
    if($test) {
      print "I need to renew $sou_file\n";
    }
    ## Read the output of sort through a pipe, started without a
    ## shell, so the file name is never interpreted as shell syntax.
    ## "sort -u" does the work of "sort | uniq".
    my $sort_fh;
    {
      local $ENV{LC_ALL}='C';
      open($sort_fh, '-|', 'sort', '-u', $bara_file)
        or die "I can't run sort on $bara_file: $!";
    }
    binmode($sort_fh, ':encoding(UTF-8)');
    ## Write to a temporary file and rename it at the end. A run that
    ## is interrupted can then not leave a half-written .sou file that
    ## the check above would take for an up-to-date one.
    my $tmp_file="$sou_file.tmp";
    my $sou_fh;
    open($sou_fh, '>:encoding(UTF-8)', $tmp_file) or die "I can't open $tmp_file.";
    my $old_pmid=0;
    while(my $line=<$sort_fh>) {
      chomp $line;
      if(not $line=~m|^(\d+) \d{4}-\d{2}-\d{2}|) {
        if($test) {
          print  "I can not parse line '$line'\n";
        }
        next;
      }
      my $pmid=$1;
      ## the lines of one pmid are adjacent, keep only the first
      if($pmid and $pmid == $old_pmid) {
        next;
      }
      $old_pmid=$pmid;
      print $sou_fh "$line\n";
    }
    ## closing the pipe reports a sort that exited with an error
    if(not close $sort_fh) {
      close $sou_fh;
      unlink $tmp_file;
      die "The sort of $bara_file failed, status $?.";
    }
    if(not close $sou_fh) {
      my $error=$!;
      unlink $tmp_file;
      die "I can't write $tmp_file: $error";
    }
    if(not rename($tmp_file,$sou_file)) {
      my $error=$!;
      unlink $tmp_file;
      die "I can't rename $tmp_file to $sou_file: $error";
    }
  }
}

## Overwrite the ".raw" files 000.raw to 999.raw in $bada_dir with the
## contents of their ".sou" files.
##
## Arguments:
##   $test - when true, progress messages are printed; default ''
##
## A number is skipped when its .raw or its .sou file is missing, or
## when both files already have the same size. The access and
## modification times of the .raw file are put back after the copy, so
## the copy does not make the .raw file look newer than it was.
## A failing copy or utime is reported with a warning and the loop
## carries on with the next number.
sub replace_raw_with_sou {
  my $test=shift // '';
  foreach my $number (0..999) {
    ## the files are named with three digits, zero-padded
    my $pemi=sprintf('%03d',$number);
    my $raw_file="$bada_dir/$pemi.raw";
    if(not -f $raw_file) {
      if($test) {
        print "I can't see $raw_file\n";
      }
      next;
    }
    my $sou_file="$bada_dir/$pemi.sou";
    if(not -f $sou_file) {
      if($test) {
        print "I can't see $sou_file\n";
      }
      next;
    }
    if((-s $sou_file) == (-s $raw_file)) {
      if($test) {
        print "$sou_file and $raw_file already have the same size\n";
      }
      next;
    }
    my ($read_time, $write_time) = (stat($raw_file))[8,9];
    if(not copy($sou_file,$raw_file)) {
      warn "I can't copy $sou_file to $raw_file: $!\n";
    }
    ## The times are put back even after a failed copy: the .sou file
    ## then still counts as newer than the .raw file.
    if(not utime($read_time, $write_time, $raw_file)) {
      warn "I can't reset the times of $raw_file: $!\n";
    }
  }
}

1;