package PubMed::Bada;

use strict;
use warnings;

use Carp qw(confess);
use Data::Dumper;
use File::Copy;

use PubMed::Common;
use PubMed::Dates;
use PubMed::Search;
use PubMed::Paths;

## Directory holding the "$pemi.raw" and "$pemi.sou" files this module
## reads and writes.
my $bada_dir = $PubMed::Paths::bada_dir;

## Private helper: linear scan of $file for the first line that starts
## with "$pmid " (pmid followed by one space). Returns that line without
## its trailing newline, or '' when the file can not be opened or no
## line matches. The file is read as bytes, like an external grep would.
sub _scan_for_pmid {
    my ($file, $pmid) = @_;
    open(my $fh, '<', $file) or return '';
    my $prefix = "$pmid ";
    my $found  = '';
    while (my $line = <$fh>) {
        if (index($line, $prefix) == 0) {
            chomp $line;
            $found = $line;
            last;
        }
    }
    close $fh;
    return $found;
}

## Look up a pmid in its .sou file.
##
## Arguments:
##   $pmid  the pmid to look for (required, confesses when missing)
##   $test  accepted for interface compatibility; not used here
##
## Returns what follows the pmid and whitespace in the found record
## (written by this module as "pmid date"), or '' when the pmid is not
## found. Confesses when a found record does not contain the pmid
## followed by whitespace.
sub search_in_bada {
    my $pmid = shift // confess "I need a pmid here to search for.";
    my $test = shift // '';
    ## NOTE(review): pemi presumably is a three-digit bucket name derived
    ## from the pmid (replace_raw_with_sou walks 000..999) -- confirm in
    ## PubMed::Common.
    my $pemi      = PubMed::Common::pmid_to_pemi($pmid);
    my $bada_file = "$bada_dir/$pemi.sou";
    my $pmid_date = PubMed::Search::search($bada_file, $pmid);
    if (not $pmid_date) {
        ## The indexed search found nothing: fall back to a plain scan of
        ## the file. This is done in-process rather than through a shell
        ## grep, so neither the pmid nor the path is shell-interpreted.
        $pmid_date = _scan_for_pmid($bada_file, $pmid);
        if ($pmid_date) {
            print "I found the pmid $pmid in $bada_file only after a grep.\n";
        }
    }
    if (not $pmid_date) {
        return '';
    }
    $pmid_date =~ s|\Q$pmid\E\s+|| or confess "I have a bad search result.";
    return $pmid_date;
}

## Append "pmid date" records to the .raw files, from the daily input
## files found under a directory.
##
## Arguments:
##   $in_dir  directory to read (required, confesses when missing).
##            Entries named with exactly four digits are treated as
##            sub-directories and processed recursively; entries named
##            YYYY-MM-DD are treated as input files; the rest is skipped.
##   $to_do   0 (default): process every input file.
##            a number: skip files for which both the -M age of the file
##            and diff_dates(file date, today) exceed that number.
##            NOTE(review): diff_dates presumably returns days -- confirm.
##            a hash reference: only process files whose full path is a
##            key with a defined value.
##   $test    when true (default 1) print progress messages.
##
## For every input line that starts with digits, the line "$pmid $date"
## is appended to "$bada_dir/$pemi.raw", where $date is the file name.
sub build_raw {
    my $in_dir = shift // confess "I need an in_dir here.";
    my $to_do  = shift // 0;
    my $test   = shift // 1;
    my $today;
    my $do_days;
    ## do a certain number of days?
    if (not ref($to_do)) {
        $today   = PubMed::Dates::today();
        $do_days = $to_do;
    }
    ## List the directory in-process; hidden entries are left out and the
    ## names are sorted, as a plain "ls" would do.
    my $dir_handle;
    if (not opendir($dir_handle, $in_dir)) {
        warn "I can't list $in_dir: $!\n";
        return;
    }
    my @dain_files = grep { not m|^\.| } readdir($dir_handle);
    closedir($dir_handle);
    @dain_files = sort @dain_files;
    foreach my $dain_file (@dain_files) {
        ## recurse if this is a year
        if ($dain_file =~ m|^\d{4}$|) {
            build_raw("$in_dir/$dain_file", $to_do, $test);
            next;
        }
        if ($test) {
            print "dain_file is $dain_file\n";
        }
        if (not $dain_file =~ m|^\d{4}-\d{2}-\d{2}$|) {
            if ($test) {
                print "I am skipping non-dain file $dain_file.\n";
            }
            next;
        }
        ## it's really the same as the file, but ok
        my $date = $dain_file;
        ## check the file is not for an old date
        my $dain_fufi = "$in_dir/$dain_file";
        if (ref($to_do)) {
            ## a hash of files to do
            if (not defined($to_do->{$dain_fufi})) {
                if ($test) {
                    print "I don't reindex $dain_fufi\n";
                }
                ## skip the file whether or not we are verbose
                next;
            }
            print "I index $dain_fufi\n";
        }
        elsif ($to_do) {
            ## to_do can be a number
            if (-M $dain_fufi > $do_days
                and PubMed::Dates::diff_dates($date, $today) > $do_days) {
                if ($test) {
                    print "I skip $date, it is more than $do_days days ago.\n";
                }
                next;
            }
            if ($test) {
                print "$dain_file is recent\n";
            }
        }
        open(my $dain_fh, '<:encoding(UTF-8)', $dain_fufi)
            or die "I can't open $dain_fufi.";
        while (my $line = <$dain_fh>) {
            chomp $line;
            if (not $line =~ m|^(\d+)|) {
                if ($test) {
                    print "I skip line '$line'\n";
                }
                next;
            }
            my $pmid      = $1;
            my $pemi      = PubMed::Common::pmid_to_pemi($pmid);
            my $bada_fufi = "$bada_dir/$pemi.raw";
            open(my $bada_fh, '>>:encoding(UTF-8)', $bada_fufi)
                or die "I can't open $bada_fufi.";
            print $bada_fh "$pmid $date\n";
            ## buffered write errors only show up at close
            close $bada_fh or die "I can't close $bada_fufi: $!";
        }
        close $dain_fh;
    }
    return;
}

## Rebuild the .sou file of every .raw file in $bada_dir that needs it.
##
## Arguments:
##   $test  when true print progress messages (default '').
##
## A .sou file is left alone when it exists, is not empty, and was
## modified more recently than its .raw file. Otherwise the .raw file is
## byte-sorted with duplicate lines removed into a temporary .son file,
## and the .sou file is written from it keeping only lines of the form
## "pmid YYYY-MM-DD" and only the first line for each pmid. The .son
## file is removed afterwards. Dies when the sort fails or a file can
## not be opened.
sub sou_from_bara {
    my $test = shift // '';
    foreach my $bara_file (glob("$bada_dir/*.raw")) {
        my $sou_file = $bara_file;
        my $son_file = $bara_file;
        $sou_file =~ s|\.raw$|.sou|
            or die "I can't derive a .sou name from $bara_file.";
        $son_file =~ s|\.raw$|.son|
            or die "I can't derive a .son name from $bara_file.";
        if (-f $sou_file
            and ((not -z $sou_file) and (-M $sou_file < -M $bara_file))) {
            if ($test) {
                print "I don't need to renew $sou_file\n";
            }
            next;
        }
        if ($test) {
            print "I need to renew $sou_file\n";
        }
        {
            ## Same result as "export LC_ALL=C; sort raw | uniq > son",
            ## but without a shell, and with the exit status checked.
            local $ENV{LC_ALL} = 'C';
            system('sort', '-u', '-o', $son_file, $bara_file) == 0
                or die "I can't sort $bara_file into $son_file (status $?).";
        }
        open(my $son_fh, '<:encoding(UTF-8)', $son_file)
            or die "I can't open $son_file.";
        open(my $sou_fh, '>:encoding(UTF-8)', $sou_file)
            or die "I can't open $sou_file.";
        my $old_pmid = 0;
        while (my $line = <$son_fh>) {
            chomp $line;
            if (not $line =~ m|^(\d+) \d{4}-\d{2}-\d{2}|) {
                if ($test) {
                    print "I can not parse line '$line'\n";
                }
                next;
            }
            my $pmid = $1;
            ## The input is sorted, so records of one pmid are adjacent
            ## and the first one carries the smallest date string.
            if ($pmid and $pmid == $old_pmid) {
                next;
            }
            $old_pmid = $pmid;
            print $sou_fh "$line\n";
        }
        close $sou_fh or die "I can't close $sou_file: $!";
        close $son_fh;
        unlink $son_file;
    }
    return;
}

## Overwrite each NNN.raw file (000 to 999) in $bada_dir with the
## contents of its NNN.sou file, keeping the access and modification
## times the .raw file had before.
##
## Arguments:
##   $test  when true print progress messages (default '').
##
## A pair is skipped when either file is missing or when both already
## have the same size.
sub replace_raw_with_sou {
    my $test = shift // '';
    foreach my $number (0 .. 999) {
        my $count    = sprintf('%03d', $number);
        my $raw_file = "$bada_dir/$count.raw";
        if (not -f $raw_file) {
            if ($test) {
                print "I can't see $raw_file\n";
            }
            next;
        }
        my $sou_file = "$bada_dir/$count.sou";
        if (not -f $sou_file) {
            if ($test) {
                print "I can't see $sou_file\n";
            }
            next;
        }
        if (-s $sou_file == -s $raw_file) {
            if ($test) {
                print "$sou_file and $raw_file already have the same size\n";
            }
            next;
        }
        ## Remember the times of the .raw file, so the copy does not make
        ## it look newer than the .sou file (sou_from_bara compares them).
        my ($read_time, $write_time) = (stat($raw_file))[8, 9];
        if (not copy($sou_file, $raw_file)) {
            warn "I can't copy $sou_file to $raw_file: $!\n";
            next;
        }
        utime($read_time, $write_time, $raw_file)
            or warn "I can't restore the times of $raw_file: $!\n";
    }
    return;
}

1;