#!/usr/bin/env perl
# This file is part of the authorindex package for LaTeX + BibTeX.
# Version: 10. August 2008

use warnings 'all';
use strict;

# configuration: apart from the command to invoke perl above you might want to
# change:

my $bstenv="BSTINPUTS";	# Environment Variable holding .bst search path
my $bibenv="BIBINPUTS";	# Env Variable holding search path for BibTeX databases
my $tmp="_autidx_";	# Name base for temporary files
my $cwdpath=".";	# Directory where files are generated
my $pathsep=($^O eq "MSWin32") ? ';' : ":"; # Seperator between paths in a list

# This script takes LaTeX .aux files as input. It extracts all citations made
# with page number information. These, together with the bibliography data base
# extracted from the input and a .bst file are processed by bibtex to get a
# file that associates each citation label with corresponding authors. Together
# with the knowledge of which work is cited on which page, this is used to
# compute which author is cited on which page. This information is written to
# the output in form suitable to be included in a LaTeX document.
# Alternatively, the script can also extract the label that appears in the
# references for each work instead of the pages. Also, output for further
# processing by makeindex can be generated instead of a 'ready' LaTeX file.

# examine command line for options

my %opt;		# hash to contain options found
use Getopt::Std;
getopts('dhikpr',\%opt);

# handle -h option: a short description of the script.

exists $opt{'h'} && die <<END;
Usage: $0 [-d] [-h] [-i] [-k] [-p] [-r] [filename ...]

-d generate additional statistical information as comments
-i generate file (with extension .ain) suitable as input for makeindex
-k keep auxiliary bibliography style file $tmp.bst after script finishes
-h print this help and exit
-r  suppresses  automatic inclusion of .aux generated by \\include-ed files
-p send result to standard output

Any number of file names can be given. If none is given, input is read from
standard input. Otherwise, all files specified are read, after the extension
.aux has been appended to their name where necessary.
END

# Make sure that all input files get .aux extension.

grep(s/$/\.aux/,grep(!/\.aux$/,@ARGV));
my @SAVEARGV=@ARGV;

my $usenum=0;			# default: put page numbers into index
my $PageTypeOrder="rRnAa";	# default order of number types
my $altedit="'skip\$";		# no editors if no authors present
my $addedit="'skip\$";		# no editors if authors present

my $citationcount=0;	# used for statistics
my $workscount=0;
my $explicits=0;
my $authorcount=0;

my $miniindex=0;	# flag: is 1 if mini-index is requested
my $nocompress=0;	# flag: is 1 if page range compression is prohibited

my @bib;		# collect .bib-file names
my %biboccured;		# collect .bib-file names
my %pnlist;		# collect occuring pages/bib.numbers as keys of hash

my $editors=0;		# number: 0 no editors; 1 editors if no authors
			# 2: editors whenever present
my $nameformat="";		# string to format/sort names
my $maxnames=999;	# max. number of author names per entry
my $truncnames=999;	# how many authors to take if max number is exceeded
my $labeltype="labels";	# wether bibliography labels or pages are indexed

my %Lab2Pag;		# for each label gives array of pages where work cited
my %Lab2Num;		# for each label give its bibliography number
my %Lab2Bib;		# ditto, but for occurrence in bibliography
my %Aut2Pag;		# for each author gives array of pages cited/in bilio.
my %Aut2Bib;		# ditto, but only occurrence in bibliography
my %Aut2Lab;		# list of BibTeX entry labels an author occurs in
my %Aut2First;		# list of all 1st authors to which author was coauthor
my %LeadAutPag;		# author -> hash marking pages with work author is 1st

my %printname;		# sort-key -> printed representation of author
my %plainname;		# sort-key -> author name

my %PageTypeOrder;	# Page type code -> number giving relative order
my %PageOrder;		# page string -> array used to sort pages

my $see="";		# string to separate other and first author, or undef.
my $bst="";		# name of BibTeX program to extract author names
my $output="";		# name of file to which author index is written
my $twoabbrev="";	# string to append to page for 2 subsequent pages
my $also="";		# string to cross-refer to first authors
my $alsosep="";		# seperator for referenced first authors

my $useaibibcite;	# flag: watch for \aibibcite, but not for \bibcite.

# scan input files and
# - build the file later to be processed by BibTeX,
# - generate a temporary bibtex database of the explicit author names given,
# - assemble for each citation the page where it was referenced and
# - look for data base specification, output file name, and so on.

open(AUXFILE,">$tmp.aux") || die "Can't open temporary file $tmp.aux\n";
open(BIBFILE,">$tmp.bib") || die "Can't open temporary file $tmp.bib\n";

while(<>){
    if(/^\\citationpage\{\s*([^{ ]+)\s*\}\{(.+)\}$/){
	$citationcount++;	# used for statistics only.
	$workscount++ unless ($1 eq '*') or (exists $Lab2Pag{$1})
	  or (exists $Lab2Bib{$1}) or (exists $Lab2Num{$1});
	$pnlist{$2}="";
	push @{$Lab2Pag{$1}},$2;
	print AUXFILE "\\citation{$1}\n";
    }elsif(/^\\aibibcite\{([^{]+)\}\{(.+)\}$/){
	$workscount++ unless ($1 eq '*') or (exists $Lab2Pag{$1})
	  or (exists $Lab2Bib{$1}) or (exists $Lab2Num{$1});
	$pnlist{$2}="";
	push @{$Lab2Num{$1}},$2;
	print AUXFILE "\\citation{$1}\n" if $usenum;
	# using \aibibcite implies we want to ignore \bibcite.  For this to
	# work, in the .aux files, the first \aibibcite must appear before any
	# \bibcite.
	$useaibibcite="yes";
    }elsif(/^\\bibcite\{([^{]+)\}\{(.+)\}$/){
	unless($useaibibcite){
	    $workscount++ unless ($1 eq '*') or (exists $Lab2Pag{$1})
		or (exists $Lab2Bib{$1}) or (exists $Lab2Num{$1});
	    $pnlist{$2}="";
	    push @{$Lab2Num{$1}},$2;
	    print AUXFILE "\\citation{$1}\n" if $usenum;
	}
    }elsif(/^\\bibpage\{([^{]+)\}\{(.+)\}$/){
	$workscount++ unless (exists $Lab2Pag{$1})
	  or (exists $Lab2Bib{$1}) or (exists $Lab2Num{$1});
	$pnlist{$2}="";
	push @{$Lab2Bib{$1}},$2;
	print AUXFILE "\\citation{$1}\n";
    }elsif(/^\\aiexplicit\{(.+)\}\{(.+)\}$/ and !$usenum){
	# above: page number might not contain '}{'
	print BIBFILE "\@MISC{$tmp$explicits,author=\"$1\"}\n";
	$pnlist{$2}="";
	push @{$Lab2Pag{"$tmp$explicits"}},$2;
	print AUXFILE "\\citation{$tmp$explicits}\n";
	$explicits++;
    }elsif(/^\\bibdata\{(.+)\}$/){
	# keep the order of data base files, but remove duplicates;
	# BibTeX complains about them.
	my $bibdb;
	for $bibdb (split(",",$1)){
	    push @bib, $bibdb unless $biboccured{$bibdb};
	    $biboccured{$bibdb}=1;
	}
    }elsif(/^\\aistyle\{(.+)\}$/){
	warn "Multiple \\authorindexstyle\n" if $bst && ($bst ne $1);
	$bst=$1;
    }elsif(/^\\aioptions\{(.*)\}$/){
	($editors,$nameformat,$maxnames,$truncnames,$labeltype)=split /\|/,$1;
	$usenum|=($labeltype eq "labels");
	if($editors>0){
	    $altedit="{ peditor format }";
	    $addedit=$altedit if $editors==2;
	}
    }elsif(/^\\aifilename\{(.+)\}$/){
	warn "Warning: Multiple authorindices\n" if $output;
	$output=$1;
    }elsif(/^\\\@input\{(.+)\}$/){
	push(@ARGV,$1) unless exists $opt{'r'};
    }elsif(/^\\pagetypeorder\{([rRaAn]+)\}$/){
	$PageTypeOrder=$1;
    }elsif(/^\\aiseestring\{(.+)\}$/){
        $see=$1;
    }elsif(/^\\aialsostrings\{(.+)\}\{(.+)\}$/){
        $also=$1;
	$alsosep=$2;
    }elsif(/^\\aitwostring\{(.+)\}$/){
        $twoabbrev=$1;
    }elsif(/^\\aiinbibflag$/){
	$miniindex=1;
    }elsif(/^\\ainocompressflag$/){
	$nocompress=1;
    }
}

close BIBFILE;

# output can go to stdout or a filename found in the input files.

$output || die "You have to include .aux file produced by .tex file containing \\begin{document}\nin the argument list and you have to \\usepackage{authorindex}!\n";
$output="-" if exists $opt{'p'};

# We need at least one BibTeX database

push @bib, $tmp if($explicits);

my $bibfiles=join(",",@bib)
  || die "You must specify at least one BibTeX database\n";
print AUXFILE "\\bibdata{$bibfiles}\n";

$see && $also && die "\\aisee and \\aialso are mutually exlusive!\n";

# if the user hasn't explicitly given a .bst style for formatting author names,
# we generate our own based on the style options found in the input files.

unless($bst){
    my @nameformat=split /;/,$nameformat;
    my ($printkey,$namefmtcmd)=("cite\$ write\$ termline\n","");
    for (@nameformat){
	my ($namerep,$sortrep)=split /:/;
	$sortrep=$namerep unless $sortrep;
	$namefmtcmd.=
	  "duplicate\$ names swap\$ \"$namerep\" format.name\$ " .
	  "write\$ termline\n" .
	  "duplicate\$ names swap\$ \"$sortrep\" format.name\$ " .
	  "purify\$ \"u\" change.case\$ write\$ termline\n$printkey";
	$printkey="termline\n";
    }

    $ENV{$bstenv}="$cwdpath$pathsep" . (exists $ENV{$bstenv} ? $ENV{$bstenv} : "");
    $bst=$tmp;
    open(BSTFILE,">$bst.bst") || die "Can't open $bst.bst\n";
    print BSTFILE <<END;	# Now comes the BibTeX programm inlined...
% Temporary file generated by $0
entry{author editor authauthor autheditor}{}{}
strings{names} integers{numnames}
function{prefersecond}                  % return second argument if non-empty
{ duplicate\$ empty\$                   % return first argument otherwise
    'skip\$                             %'
    { swap\$ }
  if\$
  pop\$
}
function{pauthor}{ author authauthor prefersecond }
function{peditor}{ editor autheditor prefersecond }
function{termline}{ newline\$ "%" write\$ newline\$ }
function{format}
{ duplicate\$ empty\$                   % field present?
    { pop\$ }                           % no: do nothing but cleanup
    { duplicate\$ 'names :=             %'memorise namelist in variable "names"
      num.names\$ duplicate\$
      #$maxnames >                      % too many names in list?
        { pop\$ #$truncnames }          %   yes, truncate.
        'skip\$                         %'  no, keep them all
      if\$
      'numnames :=                      %'save number of names
      #0                                % start index
      { duplicate\$ numnames < }        % test for "while\$"
      { #1 +                            % next name
        duplicate\$ names swap\$        %   get name list and index
        "{ll}" format.name\$            %   format curr. name
        "others" =                      %   et al part?
          'skip\$                       %'    yes, do not output
          { $namefmtcmd }               %     no: format all
        if\$
      } while\$ pop\$                   % loop until index is 0
    }
  if\$
}
function{default.type}{pauthor format pauthor empty\$ $altedit $addedit if\$ }
function{article}{default.type} function{book}{default.type}
function{booklet}{default.type} function{inbook}{default.type}
function{incollection}{default.type} function{inproceedings}{default.type}
function{conference}{default.type} function{manual}{default.type}
function{mastersthesis}{default.type} function{misc}{default.type}
function{phdthesis}{default.type} function{proceedings}{default.type}
function{techreport}{default.type} function{unpublished}{default.type}
read iterate{call.type\$}
END
    # ... and here comes perl again.
    close BSTFILE;
}

# Now we have decided on our .bst file and can finish the temporary .aux file
# we prepared for BibTeX.

print AUXFILE "\\bibstyle{$bst}\n";
close AUXFILE;

# if we have written to the temporary database, make sure BibTeX can find it.

$ENV{$bibenv}="$cwdpath$pathsep" . (exists $ENV{$bibenv} ? $ENV{$bibenv} : "") if $explicits;

# We now give BibTeX all the citation labels. In return we get a file whose
# lines in turn contain an author name and a label of a work of that author.
# The format the author names are given are determined by the BibTeX style file
# $bst.bst.

print STDERR `bibtex $tmp`;
die "BibTeX error. Aborting leaving all temporary files $tmp.*\n" if $?;

# if things went well, we can delete all these temporary files made for BibTeX.
# The generated .bst file is kept if the user wishes so (-k option).

unlink "$bst.bst" if ($bst eq $tmp && !(exists $opt{'k'}));
unlink "$tmp.aux","$tmp.bib";

# Decide wether pages or citation labels go to the index

my %Lab2Ent=%Lab2Pag;
if($usenum){
    %Lab2Ent=%Lab2Num;
    %Lab2Bib=();
}else{
    %Lab2Num=();
}

# We have now labels associated with page numbers and labels associated with
# author names (in the file generated by the BibTeX run). Now we can bring
# together the previous two main steps and compute for each author the pages
# where she is cited. In draft mode, we also remember for each author the
# labels of her works and the pages where these works are cited.

my ($firstauthor,$firstsortname,$Lab,$PrevLab)=("","","","");
open(BIBFILE,"$tmp.bbl") || die "Can't open $tmp.bbl\n";
my $author;
while($author=&readtosep()){
    map s/[\[\]]//g, $author;
    my $sortname=&readtosep();
    my $LabOrEmpty=&readtosep();
    if($LabOrEmpty){
	$PrevLab=$Lab;
	$Lab=$LabOrEmpty;
    }
    my $printname=$author;
    if($Lab ne $PrevLab){
	@{$LeadAutPag{$author}}{@{$Lab2Ent{$Lab}}}="" if exists $Lab2Ent{$Lab};
	@{$LeadAutPag{$author}}{@{$Lab2Ent{'*'}}}=""  if exists $Lab2Ent{'*'};
	@{$LeadAutPag{$author}}{@{$Lab2Bib{$Lab}}}="" if exists $Lab2Bib{$Lab};
	$firstauthor=$author;
	$firstsortname=$sortname;
    }else{
        ${$Aut2First{$sortname}}{$firstsortname}="";
        if($see){
            $printname="{$author}$see\\aifirst{$firstauthor}";
	    $sortname="$sortname$see$firstsortname";
	}
    }
    @{$Aut2Pag{$printname}}{@{$Lab2Ent{$Lab}}}="" if exists $Lab2Ent{$Lab};
    @{$Aut2Pag{$printname}}{@{$Lab2Ent{'*'}}}=""  if exists $Lab2Ent{'*'};
    @{$Aut2Pag{$printname}}{@{$Lab2Bib{$Lab}}}="" if exists $Lab2Bib{$Lab};
    @{$Aut2Bib{$printname}}{@{$Lab2Bib{$Lab}}}="" if exists $Lab2Bib{$Lab};
    push @{$Aut2Lab{$printname}},$Lab if (exists $opt{'d'}) and $LabOrEmpty;
    $printname{$sortname}=$printname;
    $plainname{$sortname}=$author;
}
close BIBFILE;
unlink "$tmp.blg","$tmp.bbl";

# Last not least, output the results, properly sorted if needed.

open(AIFILE,">$output") || die "Can't create author index file $output\n";

# convert page type order into numerical values
my ($i,$page);
$PageTypeOrder{$i}=length($PageTypeOrder) while($i=chop $PageTypeOrder);
# create table that relates page to page order info
for $page (keys %pnlist){
    $PageOrder{$page}=&parse_pagenumber($page);
}

if($also){
    my $coauthorname;
    for $coauthorname (keys %Aut2First){
        my @namelist;
	my $sortname;
	for $sortname (sort keys %{$Aut2First{$coauthorname}}){
	    push @namelist, $printname{$sortname};
	}
        # we rely on the fact that " " is alphabetically first, so that
        # $justbehind will end up directly after $coauthorname
        my $justbehind=$coauthorname." ";
        my $pseudoname=$also.join($alsosep, @namelist);
        $printname{$justbehind}=$pseudoname;
        $plainname{$justbehind}=$pseudoname;
    }
}

if(exists $opt{'i'}){
    # generate file for makeindex: leave the work for makeindex.
    my $name;
    for $name (keys %printname){
	my $author=$printname{$name};
        my $page;
	for $page (keys %{$Aut2Pag{$author}}){
	    print AIFILE "\\indexentry{$name\@$author}{$page}\n";
	}
    }
}else{
    # sort result, throw away duplicate page numbers and generate LaTeX file.
    print AIFILE "\\begin{theauthorindex}\n";
    my ($prevfirstchar,$prevplain,$name)=("","","");
    for $name (sort keys %printname){
	my $thisfirstchar=substr($name,0,1);
	if($thisfirstchar ne $prevfirstchar){
	    print AIFILE "\\indexspace\n" if $prevfirstchar;
	    $prevfirstchar=$thisfirstchar;
	}
	my $author=$printname{$name};
	my $plain=$plainname{$name};
	my $rep=$author;
	if($Aut2Lab{$author}){
	    print AIFILE "% @{$Aut2Lab{$author}}\n" if(exists $opt{'d'});
	    $rep=($plain eq $prevplain) ? "\\airep$author" : "\\aitop$author"
		if $plain ne $author;
	    $prevplain=$plain;
	    $authorcount++;
        }
	print AIFILE "\\item[$rep]";
	$Aut2Bib{$author}={} unless exists $Aut2Bib{$author};
	$LeadAutPag{$author}={} unless exists $LeadAutPag{$author};
	my %b2p=%{$Aut2Bib{$author}};
	my %lp=%{$LeadAutPag{$author}};
	my %pagerep;
        my $page;
	for $page (keys %{$Aut2Pag{$author}}){
	    my $prep=$page;
	    $prep="\\aifirstpage{$prep}" if exists $lp{$page};
	    $prep="\\aibibpage{$prep}"   if exists $b2p{$page};
	    $pagerep{$page}=$prep;
	}
	my $res=&compressed_pages($Aut2Pag{$author},\%pagerep);
	print AIFILE " \\aipages{$res}\n";
    }
    print AIFILE "\\end{theauthorindex}\n";
    if(exists $opt{'d'}){	# in draft mode, include some statistics
	print AIFILE "%\n% $citationcount citations ";
	print AIFILE "of $workscount distinct works\n";
	print AIFILE "% $explicits times \\aimention\n" if $explicits;
	print AIFILE "% $authorcount different authors\n";
    }
}
close AIFILE;

# merge mini indices into the .bbl-Files if it was requested.

if($miniindex){
    map s/aux$/bbl/,@SAVEARGV;
    my $file;
    for $file (@SAVEARGV){
	open(BBLINPUT,$file) || next;
	open(BBLHELP,">$tmp.bbl") || die "Can't create temp file $tmp.bbl\n";
	my $currlabel="";
	while(<BBLINPUT>){
	    if(/\\bibitem(\[.*\])*\{(.*)\}|\\end\{thebibliography\}/){
		if($currlabel){
		    my $pagelist=&pages_for_label($currlabel);
		    print BBLHELP "\\bibindex{$pagelist}\n";
		}
		$currlabel=$2;
		print BBLHELP "$_";
	    }elsif(/\\bibindex\{(.*)\}/){
		if($currlabel){
		    my $pagelist=&pages_for_label($currlabel);
		    print BBLHELP "$`\\bibindex{$pagelist}$'";
		}
		$currlabel="";
	    }else{
		print BBLHELP "$_" if "$_" ne "\n";
	    }
	}
	close BBLHELP;
	close BBLINPUT;
	rename "$tmp.bbl","$file" || die "Can't replace old $file\n";
    }
}

# auxiliary functions

# convert roman numeral string to integer

sub romanvalue
{
    local($_)=shift;  tr/IVXLCDM/ivxlcdm/;
    my %romandigits =  ("i", 1,   "v", 5,   "x", 10, "l", 50,
			"c", 100, "d", 500, "m", 1000);
    my ($i,$sum,$prev)=("",0,1);
    while($i=chop){
	my $this=$romandigits{$i};
	$sum=$sum+(($this<$prev) ? -$this : $this);
	$prev=$this;
    }
    return $sum;
}

# convert letter to numeric value

sub alphavalue
{
    local($_)=@_; tr/A-Z/a-z/;
    return ord($_)-ord("a");
}

# split page number in components and replace each component by a number for
# the page type and the page number as an integer.

sub parse_pagenumber
{
    local($_)=@_;
    my $res="";
    while($_){
	s/^[^\\A-Za-z0-9]*//;
	if(exists $PageTypeOrder{'n'} && s/(^\d+)//){
	    $res.="$PageTypeOrder{'n'}".sprintf "%0.6d",$1;
	}elsif(exists $PageTypeOrder{'R'} && 
	       s/^\\uppercase\s*\{([ivxlcdm]+)\}//){
	    $res.="$PageTypeOrder{'R'}".sprintf "%0.4d",&romanvalue($1);
	}elsif(exists $PageTypeOrder{'R'} && s/(^[IVXLCDM]+)//){
	    $res.="$PageTypeOrder{'R'}".sprintf "%0.4d",&romanvalue($1);
	}elsif(exists $PageTypeOrder{'A'} && s/(^[A-Z])//){
	    $res.="$PageTypeOrder{'A'}".sprintf "%0.2d",&alphavalue($1);
	}elsif(exists $PageTypeOrder{'r'} && s/(^[ivxlcdm]+)//){
	    $res.="$PageTypeOrder{'r'}".sprintf "%0.4d",&romanvalue($1);
	}elsif(exists $PageTypeOrder{'a'} && s/(^[a-z])//){
	    $res.="$PageTypeOrder{'a'}".sprintf "%0.2d",&alphavalue($1);
	}else{
	    s/^.//;
	}
    }
    return $res;
}

# test wether 2 pages are subsequent

sub a_follows_b
{
    my ($i,$j)=@PageOrder{@_};
    $i++;
    return($i eq $j);
}

# make a sorted, maybe compressed, list of pages

sub compressed_pages
{
    my ($A,$B)=@_;
    my %pages=%{$A};
    my %pagerep=%{$B};
    my ($prevpage,$pendrep,$res,$pagepending,$page)=("","","","","");
    for $page (sort { $PageOrder{$a} cmp $PageOrder{$b} } keys %pages){
	# handle compression of page ranges.  At the moment, we
	# also compress ranges that might be displayed in different faces
	my $pagerep=(exists $pagerep{$page}) ? $pagerep{$page} : $page;
	if($prevpage){
	    if(!$nocompress and &a_follows_b($prevpage,$page)){
		$pendrep=$pagepending ? "--$pagerep"
		  : ($twoabbrev ? "$twoabbrev" : ", $pagerep");
		$pagepending=1;
	    }else{
		$res.=($pagepending ? "$pendrep" : "").", $pagerep";
		$pagepending=0;
	    }
	}else{
	    $res.="$pagerep";
	}
	$prevpage=$page;
    }
    $res.="$pendrep" if $pagepending;
    return $res;
}

sub pages_for_label
{
    my ($label)=@_;
    if(exists $Lab2Pag{$label}){
	my (%pages,%empty);
	@pages{@{$Lab2Pag{$label}}}="";
	return &compressed_pages(\%pages,\%empty);
    }
    return "";
}

sub readtosep
{
    my $sum="";
    while(<BIBFILE>){
	return $sum if(/^%$/);
	chop; s/%$//;
	$sum.=$_;
    }
}