package PubMed::Index;

use strict;
use warnings;

use Carp qw(confess);
use Data::Dumper;
use File::Compare;
use XML::LibXML;

#use PubMed::Api;
use PubMed::Common;
use PubMed::Paths;
use PubMed::Search;
use PubMed::Strings;

## Directory in which check_purd_by_pmid() looks for "<pemi>.sox" files.
my $barf_dir = $PubMed::Paths::barf_dir;

## Split a whitespace-separated location line and return its second,
## third and fourth fields as ($start, $length, $infi). The first field
## is skipped. Confesses when no line is given. Fields that are missing
## from a short line come back as undef.
sub parse_stalefi_line {
    my $line = shift // confess "I need a line here.";
    my @comps = split(' ', $line);
    my ($start, $length, $infi) = @comps[1 .. 3];
    return ($start, $length, $infi);
}

## Read $length bytes starting at offset $start from the file
## "<rawx_dir>/pubmed<infi>.xml" and return them.
##
##   $start, $length  offset and size passed to seek() and read()
##   $infi            file identifier, interpolated into the file name
##   $fhs             optional hash ref used as a filehandle cache keyed
##                    by $infi; a handle opened here is stored in it so
##                    that later calls with the same hash reuse it
##
## Confesses when $infi is missing, or when the file cannot be opened,
## positioned or read.
sub read_stalefi {
    my ($start, $length, $infi, $fhs) = @_;
    if (not defined($infi)) {
        confess "I need an infi.";
    }
    ## Without a caller-supplied cache the handle lives only for this
    ## call and is closed when the private hash goes out of scope.
    $fhs //= {};
    my $rawx_dir  = $PubMed::Paths::rawx_dir;
    my $rawx_file = "$rawx_dir/pubmed$infi.xml";
    if (not defined($fhs->{$infi})) {
        ## Three-argument open: the path can never be taken for a mode.
        open($fhs->{$infi}, '<', $rawx_file)
            or confess "I can not open the file $rawx_file: $!";
    }
    seek($fhs->{$infi}, $start, 0)
        or confess "I can not seek to $start in $rawx_file: $!";
    my $text;
    my $got = read($fhs->{$infi}, $text, $length);
    if (not defined($got)) {
        confess "I can not read from $rawx_file: $!";
    }
    return $text;
}

## Return the first four whitespace-separated fields of $line, joined
## by single spaces.
sub get_latest_loc {
    my $line  = shift;
    my @parts = split(' ', $line);
    my $loc   = join(' ', @parts[0 .. 3]);
    return $loc;
}

## Look up $pmid with check_purd_by_pmid() and return the text that
## read_stalefi() reads for the location found. Returns '' when the
## lookup finds nothing. Confesses when the location is a bare number,
## dies with a dump of the parsed fields when the location does not
## yield all three coordinates.
sub get_purd_by_pmid {
    my $pmid = shift;
    my $loc  = check_purd_by_pmid($pmid);
    if (not $loc) {
        return '';
    }
    if ($loc =~ m|^\d+$|) {
        confess "loc $loc is a pmid.";
    }
    my @stalefi = parse_stalefi_line($loc);
    ## parse_stalefi_line() always hands back three values, so a short
    ## location only shows up as undefined entries.
    if (scalar(@stalefi) != 3 or scalar(grep { not defined($_) } @stalefi)) {
        die Dumper @stalefi;
    }
    return read_stalefi(@stalefi);
}

## Search the file "$barf_dir/<pemi>.sox" for $pmid, where <pemi> is
## what PubMed::Common::pmid_to_pemi() returns for the pmid. Returns
## the search result, or '' when the search result is false. Confesses
## when no pmid is given or when the .sox file does not exist.
sub check_purd_by_pmid {
    my $pmid      = shift // confess "I need a pmid here.";
    my $pemi      = PubMed::Common::pmid_to_pemi($pmid);
    my $pemi_file = "$barf_dir/$pemi.sox";
    if (not -f $pemi_file) {
        confess "I don't see $pemi_file";
    }
    #my $s="$PubMed::Paths::look -b $pmid $pemi_file";
    #my $loc=`$s`;
    #chomp $loc;
    my $loc = PubMed::Search::search($pemi_file, $pmid);
    if (not $loc) {
        return '';
    }
    return $loc;
}

#sub get_purd_by_line {
#  my $line=shift;
## fapi case
#if($line=~m|^(\d+)$|) {
#  my $pmid=$1;
#  return &PubMed::Api::get_purd_by_pmid($pmid) // '';
#}
#if($line=~m-^\d+ \d+ \d+ \d{2}n\d{4}(\s|$)-) {
#  my @stalefi=&PubMed::Index::parse_stalefi_line($line);
#  return &PubMed::Index::read_stalefi(@stalefi);
#}
## in the case of pmid +
## number, seed 2016-01-08
#if($line=~m|^(\d+) |) {
#  my $pmid=$1;
#  warn "You gave me a bad line '$line', but I try pmid $pmid";
#  return &PubMed::Index::get_purd_by_pmid($pmid) // '';
#}
#confess "You gave me a very bad line '$line'";
#}

## Return the pmid of a location line: the text in front of its first
## space, or the whole line when it contains no space at all.
## Confesses when that text is empty or otherwise false.
sub get_pmid_from_loc {
    my $loc   = shift // '';
    my $index = index($loc, ' ');
    ## index() gives -1 when there is no space; handing that to substr()
    ## as a length would silently cut off the last character.
    my $pmid = $index < 0 ? $loc : substr($loc, 0, $index);
    if (not $pmid) {
        confess "I can't get the pmid from '$loc'";
    }
    return $pmid;
}

## Return the last space-separated field of a location line, which has
## to look like two digits, the letter "n" and four digits.
## Confesses on anything else, including an empty or missing line.
sub get_earliest_infi {
    my $loc   = shift // '';
    my @parts = split(/ /, $loc);
    ## An empty line leaves no fields; fall back to '' so that the
    ## pattern check below rejects it without warnings.
    my $infi = $parts[-1] // '';
    if ($infi !~ m|^\d{2}n\d{4}$|) {
        confess "I found a bad infi $infi";
    }
    return $infi;
}

1;