package PubMed::Record; use strict; use warnings; use utf8; use Data::Dumper; use Carp qw(confess); use PubMed::Dates; my $date_paths; $date_paths->[0]='/MedlineCitation/Article/ArticleDate'; $date_paths->[1]='/MedlineCitation/Article/Journal/JournalIssue/PubDate'; my $pmid_path='/MedlineCitation/PMID'; ## a counter to see how sucessful the paths are our $count_paths; foreach my $date_path (@$date_paths) { $count_paths->{$date_path}=0; } my $month_number=$PubMed::Dates::month_number; my $seasons_date=$PubMed::Dates::seasons_date; sub get_pmid { my $txt=shift; my $doc=shift; my $xpc = XML::LibXML::XPathContext->new($doc); my @nodes=$xpc->find($pmid_path)->get_nodelist; if(not scalar @nodes) { confess "I found no pmid in \n$txt."; } if(@nodes) { if(scalar(@nodes) > 1) { confess "I found several years in \n$txt;"; } } my $pmid=$nodes[0]->textContent; if(not $pmid=~m|^\d+$|) { confess "This '$pmid' is not a pmid"; } return $pmid; } sub get_date_part { my $txt=shift; my $doc=shift; my $type=shift; my $xpc = XML::LibXML::XPathContext->new($doc); my $year=0; my $part=''; foreach my $path (@{$date_paths}) { $part=&try_to_find_part($xpc,$path.'/'.$type,$txt) or next; if($part) { $count_paths->{$part}->{$path}++; last; } } if(not $part and $main::test) { print "no $part in\n$txt\n"; exit; } return $part; } sub find_date { my $txt=shift; my $doc=shift; my $year=&get_date_part($txt,$doc,'Year'); if(not $year=~m|^\d{4}$|) { return 0; } ## check the seasons my $season=&get_date_part($txt,$doc,'Season') // ''; if($season) { if($seasons_date->{$season}) { return "$year-".$seasons_date->{$season}; } confess "I don't know about the season '$season'."; } my $month=&get_date_part($txt,$doc,'Month'); if(not $month) { ## report the PubDate; my $pubdate_ele=$doc->getElementsByTagName('PubDate')->[0]; open(M,">> /tmp/monthless"); print M $pubdate_ele->toString(),"\n"; close M; return "$year-01-01"; } if($month=~m|^\d$|) { $month="0$month"; } if($month_number->{$month}) { $month=$month_number->{$month}; } if(not $month=~m|\d{2}|) { confess "month is $month in $txt"; return 0; } my $day=&get_date_part($txt,$doc,'Day'); if($day=~m|^\d$|) { $day="0$day"; } if(not $day) { $day='01'; } if(not $day=~m|\d{2}|) { return 0; } return "$year-$month-$day"; } sub find_medline_date { my $txt=shift; my $doc=shift; my $xpc = XML::LibXML::XPathContext->new($doc); my $medline_date_xp='/MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate'; my @nodes=$xpc->find($medline_date_xp)->get_nodelist; if(@nodes) { if(scalar(@nodes) > 1) { confess "I found several years in \n$txt;"; } my $to_parse=$nodes[0]->textContent; my $text=&parse_medline_date($to_parse); if(not $text) { print "My parse_medline_date could not parse '$to_parse'.\n"; } return $text; } return 0; } sub parse_medline_date { my $in=shift; my $year; my $month; my $day; if($in=~m|^(\d{4})$|) { my $year=$1; return "$year-01-01"; } if($in=~m|^(\d{4})-(\d{4})|) { my $year=$1; return "$year-01-01"; } if($in=~s|^(\d{4})\s+||) { $year=$1; } elsif($in=~m|^(\d{4})-(\d{4})$|) { return "$1-01-01"; } elsif($in=~m|^(\d{4})-(\d{4}) Winter$|) { return "$1-12-21"; } elsif($in=~m|^(\d{4}), (\d{4})|) { return "$1-01-01"; } elsif($in=~m|^(\d{4}), reçu|) { return "$1-01-01"; } elsif($in=~m|^(\d{4})-00 Winter$|) { return "$1-01-01"; } elsif($in=~m|^(\d{3})(\d)-(\d)$|) { my $year="$1$2"; if($2+1 == $3) { return "year-01-01"; } return ''; } elsif($in=~m|^(\d{4})-\d{2}$|) { return "$1-01-01"; } elsif($in=~m|^(\D+) (\d{4})$|) { my $year=$2; my $may_be_season=$1; if($seasons_date->{$may_be_season}) { return "$year-".$seasons_date->{$may_be_season}; } } elsif($in=~m|^(\d{2})(\d{2})-(\d{2})|) { $year="$1$2"; if($2 eq $3) { ## remove the redundant part $in=~s|\d{4}-(\d{2}) ||; } } else { warn "I warn you. I can't parse '$in'\n"; return ''; } ## scan after the year ## year repeat in two-digits $in=~s|;$||; $in=~s|Early\s*||; $in=~s|Contd\s+||; $in=~s|\s*\(II\)\s*||; if($in=~m|^1st Quart|) { return "$year-01-01"; } if($in=~m|^2n?d Quart|) { return "$year-04-01"; } if($in=~m|^Second Quarter|) { return "$year-04-01"; } if($in=~m|^3r?d Quart|) { return "$year-06-01"; } if($in=~m|^Third Quart|) { return "$year-06-01"; } if($in=~m|^4th Quart|) { return "$year-09-01"; } if($in=~m|^1st Trimest|) { return "$year-01-01"; } if($in=~m|^2n?d Trimest|) { return "$year-04-01"; } if($in=~m|^3r?d Trimest|) { return "$year-06-01"; } if($in=~m|^4th Trimest|) { return "$year-09-01"; } if($in=~m|^EASTER|i) { return "$year-04-07"; } if($in=~m|^Christmas|i) { return "$year-12-25"; } ## parts, we can't make a guess if($in=~m|^p \d+|i) { return "$year-01-01"; } if($in=~m| p |i) { return "$year-01-01"; } if($in=~m|Part|i) { return "$year-01-01"; } if($in=~m|Suppl|i) { return "$year-01-01"; } if($in=~m|^p \d+|i) { return "$year-01-01"; } if($in=~m|^1/2|i) { return "$year-01-01"; } if($in=~m|^1-2|i) { return "$year-02-01"; } if($in=~m|^Summber|i) { return "$year-06-22"; } if($in=~m|^Summer/Fall|i) { return "$year-06-22"; } if($in=~m|^3/4|i) { return "$year-06-01"; } my $start=substr($in,0,3); if($month_number->{$start}) { return "$year-".$month_number->{$start}."-01"; } if($in=~m|^(\d)$|) { return "$year-0$1-01"; } ## year + two digits if($in=~m|^(\d+)$|) { my $may_be_month=$1; if($may_be_month<13) { return "$year-$may_be_month-01"; } return "$year-01-01"; } ## year + (date enacted) if($in=~m|\(date enacted\)|) { return "$year-01-01"; } ## year + (date approved) if($in=~m|\(date approved\)|) { return "$year-01-01"; } ## year + semester if($in=~m|1st Semest|) { return "$year-01-01"; } if($in=~m|2rd Semest|) { return "$year-06-01"; } ## year season if($seasons_date->{$in}) { return "$year-".$seasons_date->{$in}; } ## year season if($month_number->{$in}) { return "$year-".$month_number->{$in}.'-01'; } if($in=~m|^(\d{1,2})th (\w{3})$|) { my $may_be_day=$1; my $may_be_month=$2; if($may_be_day>32) { return ''; } my $month=$month_number->{$may_be_month} // return ''; if($may_be_day<10) { return $year.'-'.$month.'-0'.$may_be_day; } else { return $year.'-'.$month.'-'.$may_be_day; } } if($in=~m|^(\d{1,2}) (\w{3})$|) { my $may_be_day=$1; my $may_be_month=$2; if($may_be_day>32) { return ''; } my $month=$month_number->{$may_be_month} // return ''; if($may_be_day<10) { return $year.'-'.$month.'-0'.$may_be_day; } else { return $year.'-'.$month.'-'.$may_be_day; } } ## big numbers, don't know what to do if($in=~m|S?\d{2,}-S?\d{2,}|) { return $year.'-01-01'; } ## just a guess if($in=~m|^9-10$|) { return $year.'-09-01'; } ## year + season + season if($in=~m|^([A-Z][a-z]+)-([A-Z][a-z]+)$|) { my $may_be_season_1=$1; my $may_be_season_2=$2; if($seasons_date->{$may_be_season_1} and $seasons_date->{$may_be_season_2}) { return "$year-".$seasons_date->{$may_be_season_1}; } } ## year + season and season if($in=~m|^([A-Z][a-z]+) and ([A-Z][a-z]+)$|) { my $may_be_season_1=$1; my $may_be_season_2=$2; if($seasons_date->{$may_be_season_1} and $seasons_date->{$may_be_season_2}) { return "$year-".$seasons_date->{$may_be_season_1}; } } ## year + season - year, usually followed by a season, so not anchored at the end if($in=~m|^([A-Z][a-z]+)[- ]+(\d{4})|) { my $may_be_season=$1; if($seasons_date->{$may_be_season}) { return "$year-".$seasons_date->{$may_be_season}; } } ## ' 5th Jan 2017' if($in=~m|^\s*(\d+)th\s+(\S{3})\s+(\d{4})\s*$|) { my $day=$1; my $month_short_name=$2; my $month=&month_short_name_to_number($month_short_name); my $year=$3; my $result="$year-$month-$day"; return $result; } ## month1-month2 if($in=~m|^\s*([A-Z][a-z][a-z])-[A-Z][a-z][a-z]\s*$|) { my $month_short_name=$1; my $month=&month_short_name_to_number($month_short_name); my $result="$year-$month-01"; return $result; } if($in eq 'Dev') { my $result="$year-12-01"; return $result; } if($in eq 'Desember') { my $result="$year-12-01"; return $result; } if($in eq 'Dic') { my $result="$year-12-01"; return $result; } if($in eq '"Apr') { my $result="$year-04-01"; return $result; } ## I give up. my $return="$year-01-01"; warn "I warn you. I return the default $return for '$in'."; return $return; } sub month_short_name_to_number { my $in=shift; if($in eq 'Jan') { return '01'; } if($in eq 'Feb') { return '02'; } if($in eq 'Mar') { return '03'; } if($in eq 'Apr') { return '04'; } if($in eq 'May') { return '05'; } if($in eq 'Jun') { return '06'; } if($in eq 'Jul') { return '07'; } if($in eq 'Aug') { return '08'; } if($in eq 'Sep') { return '09'; } if($in eq 'Oct') { return '10'; } if($in eq 'Nov') { return '11'; } if($in eq 'Dec') { return '12'; } } sub try_to_find_part { my $xpc=shift; my $path=shift; my $txt=shift; my @nodes=$xpc->find($path)->get_nodelist; if(@nodes) { if(scalar(@nodes) > 1) { confess "I found several years in \n$txt;"; } my $text=$nodes[0]->textContent; return $text; } return ''; } 1;