#!/usr/bin/perl - w

#################################################################################################
#												#
# entrezparse.pl - a perl script to parse entrez flatfiles into csv format			#
# 										 		#
#												#
# takes the file name as the only argument, call with ./entrezparse.pl filein fileout		#
# to parse the file "filein" and save the output to a csv file "fileout"			#
#												#
# fields containing ? are set to NULL (i.e. "")							#
#												#
# The code consists of a set of functions that extract and return information from particular	#
# sorts of lines, a while look which runs the relevent functions, and a special function 	#
# 'newline' which prints it all to the output file.						# 
#												#
# This code currently just works for SNP outputs, but is extendable to other formats by		#
# a) modifying or adding functions, b) altering the print order in newline and c) changing 	#
#  												#
# 												#
# Luke 13/11/2007										#
#												#
#################################################################################################

#TODO - currently ditches any LOC lines after the first for each contig - at the moment it just flags this


### This section contains the functions for each line type, which are called later - program begins below them 

sub rsfun {
	# adds the rs_id fields: rs_id 
	if ($_[0] !~ /\?/) {$rs_id = $_[0];}
	$result = $rs_id;
}

sub snpfun{
	# adds the snp fields: alleles, heterozygosity, het-error
	$_[2] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$alleles = $1 ;}
	$_[4] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$het = $1 ;}
	$_[6] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$heterr = $1; $heterr =~ s/\s+$//;}
	$result = ",".$alleles.",".$het.",".$heterr;
}

sub valfun{
	# adds the validation fields: validated
	$_[2] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$validation = $1 ;}
	$result = ",".$validation;
}

sub celerafun{
	# adds the celera build fields: chromosome, chromosome position, assembly code, contig start, contig end, locType, strand
	$_[4] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$celerachr = $1;}
	$_[6] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$celerachrpos = $1;}
	if ($_[8] !~ /\?/) {$celeraass = $_[8] ;}
	$_[10] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$celeractgstart = $1;}
	$_[12] =~ /=(.+)$/;	
	if ($1 !~ /\?/) {$celeractgend = $1;}
	$_[14] =~ /=(.+)$/;	
	if ($1 !~ /\?/) {$celeraloctype = $1;}
	$_[16] =~ /=(.+)$/;	
	if ($1 !~ /\?/) {$celerastrand = $1;$celerastrand =~ s/\s+$//;}
	$result = ",".$celerachr.",".$celerachrpos.",".$celeraass.",".$celeractgstart.",".$celeractgend.",".$celeraloctype.",".$celerastrand;
}

sub reffun{
	# adds the reference build fields: chromosome, chromosome position, assembly code, contig start, contig end, locType, strand
	$_[4] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$refchr = $1;}
	$_[6] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$refchrpos = $1;}
	if ($_[8] !~ /\?/) {$refass = $_[8];}
	$_[10] =~ /=(.+)$/;	
	if ($1 !~ /\?/) {$refctgstart = $1;}
	$_[12] =~ /=(.+)$/;	
	if ($1 !~ /\?/) {$refctgend = $1;}
	$_[14] =~ /=(.+)$/;	
	if ($1 !~ /\?/) {$refloctype = $1;}
	$_[16] =~ /=(.+)$/;	
	if ($1 !~ /\?/) {$refstrand = $1;$refstrand  =~ s/\s+$//;}
	$result = ",".$refchr.",".$refchrpos.",".$refass.",".$refctgstart.",".$refctgend.",".$refloctype.",".$refstrand;
}

sub cellocfun{
	# adds the celera build location fields: gene, locus, function
	$cellocflag = 0;
	if ($_[2] !~ /\?/) {$celeragene = $_[2];}
	$_[4] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$celeralocusid = $1;}
	$_[6] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$celerafunction = $1;$celerafunction  =~ s/\s+$//;}
	$result = ",".$celeragene.",".$celeralocusid.",".$celerafunction;
}

sub reflocfun{
	# adds the celera build location fields: gene, locus, function
	$reflocflag = 0;
	if ($_[2] !~ /\?/) {$refgene = $_[2];}
	$_[4] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$reflocusid = $1;}
	$_[6] =~ /=(.+)$/;
	if ($1 !~ /\?/) {$reffunction = $1;$reffunction =~ s/\s+$//}
	$result = ",".$refgene.",".$reflocusid.",".$reffunction;
}

sub newline{
	# puts it all together, then adds a new line
	print OUT $rsstring.",".$sscount.$valstring.$snpstring.$celerastring.",".$celextralocflag.$celeralocstring.$referecestring.",".$refextralocflag.$refencelogstring."\n";

	# reset the flags
	$celeraflag = 0;
	$referenceflag = 0;
	$rsflag = 0;
	$snpflag = 0;
	$valflag = 0;
	$cellocflag = 0;
	$celextralocflag = 0;
	$reflocflag = 0;
	$refextralocflag = 0;

	# resent all the entries
	$rs_id = "";
	$alleles = "";
	$het = "";
	$heterr = "";
	$validation = "";
	$celerachr = "";
	$celerachrpos = "";
	$celeraass = "";	
	$celeractgstart = "";
	$celeractgend = "";
	$celeraloctype = "";
	$celerastrand = "";
	$refchr = "";
	$refchrpos = "";
	$refass = "";	
	$refctgstart = "";
	$refctgend = "";
	$refloctype = "";
	$refstrand = "";
	$sscount = 0;
	$refgene = ",";
	$reflocusid = ",";
	$reffunction = ",";
	$celeragene = ",";
	$celeralocusid = ",";
	$celerafunction = ",";

	$rsstring = "";
	$valstring = ",";
	$snpstring = ",,,";
	$celerastring = ",,,,,,,";
	$celeralocstring = ",,,";
	$referecestring = ",,,,,,,";
	$refencelogstring = ",,,";
	
	$k += 1;
	if (($k % 1000) == 0) {print "$k \n";};	

}


########## Program starts here #########################

## This does a quick error check for the number of arguments

$starttime = time();

if($#ARGV != 1){
	die "Error: Needs exactly 2 arguments";
}

# open a read pipe to the input file, and a write-pipe to the output file, or die trying

open(IN,$ARGV[0])|| die "Error: cannot open file $ARGV[0]";
open(OUT,">$ARGV[1]") || die "Error: cannot write to file $ARGV[1]";


# print the headers to the first line of the output file

print OUT "rs.id,no.of.ss.nos,validated,alleles,heterozygosity,het.error,celera.chromosome.number,celera.chromosome.position,";
print OUT "celera.assembly.code,celera.contig.start,celera.contig.end,celera.loctype,celera.strand,multiple.celera.functions?,";
print OUT "celera.gene,celera.locus.id,celera.function,reference.chromosome.number,reference.chromosome.position,";
print OUT "reference.assembly.code,reference.contig.start,reference.contig.end,reference.loctype,reference.strand,";
print OUT "multiple.reference.functions?,reference.gene,reference.locus.id,reference.function\n";

#Intialize all the flags

$endflag = 1; # starts at one to prevent program created a blank first line
$celeraflag = 0;
$referenceflag = 0;
$rsflag = 0;
$snpflag = 0;
$valflag = 0;
$cellocflag = 0;
$celextralocflag = 0;
$reflocflag = 0;
$refextralocflag = 0;

# Initialize all the entries

$rs_id = "";
$alleles = "";
$het = "";
$hererr = "";
$validation = "";
$celerachr = "";
$celerachrpos = "";
$celeraass = "";	
$celeractgstart = "";
$celeractgend = "";
$celeraloctype = "";
$celerastrand = "";
$refchr = "";
$refchrpos = "";
$refass = "";	
$refctgstart = "";
$refctgend = "";
$refloctype = "";
$refstrand = "";
$sscount = 0; # this is a counter, checks the number of ss-names in the entry
$refgene = ",";
$reflocusid = ",";
$reffunction = ",";
$celeragene = ",";
$celeralocusid = ",";
$celerafunction = ",";

$rsstring = "";
$snpstring = ",,,";
$valstring = ",,,";
$celerastring = ",,,,,,,";
$celeralocstring = ",,,";
$referecestring = ",,,,,,,";
$refencelogstring = ",,,";

# Parse the file - run through the lines, and call functions dependand of the line
# $endflag makes sure that there is only one new line per entry

$k = 0;

while(<IN>){
	@array = split(/ /);

	# blank line
	if (/^[\r,\n]/ && $endflag == 1) {next ;} # go to next iteration if nothing has changed
	if (/^[\r,\n]/) {&newline ; $endflag = 1;} # otherwise finalise the line line

	# rs number	
	if (/^rs/ && $rsflag == 1) {die "Error: multiple rs numbers for $rs_id";}
	if (/^rs/) {$rsstring = &rsfun(@array); $endflag = 0; $rsflag = 1;}
	
	# ss number (just counts them, nothing else)
	if (/^ss/) {$sscount += 1}

	# SNP details
	if (/^SNP/ && $snpflag == 1) {die "Error: multiple SNP entries for $rs_id";}
	if (/^SNP/) {$snpstring = &snpfun(@array); $endflag = 0; $snpflag = 1;}

	# validation information
	if (/^VAL/ && $valflag == 1) {die "Error: multiple validatione entries for $rs_id";}
	if (/^VAL/) {$valstring = &valfun(@array); $endflag = 0; $valflag = 1;}

	# contig info - calls different functions for Celera and reference contigs
	if (/^CTG/) {
		if ($array[2] =~ /Celera/ && $celeraflag == 1) {$endflag = 0; next ;} #die "Error: Multiple Celera contigs for $rs_id";}
		if ($array[2] =~ /reference/ && $referenceflag == 1) {die "Error: Multiple reference contigs for $rs_id";}
		if ($array[2] =~ /Celera/) {
			$celerastring = &celerafun(@array); 
			$celeraflag = 1; # switch between reference and celera flags
			$refflag = 0;
		} elsif ($array[2] =~ /reference/) {
			$referecestring = &reffun(@array);
			$refflag = 1; # and vice versa
			$celeraflag = 0;
		} else {$celeraflag = 0; $refflag = 0;} # die "Error: Unrecognised contig at $rs_id"}
		$endflag = 0;}

	# Location information
	if (/^LOC/) {
		if ($celeraflag == 1) {
			if ($cellocflag == 1) {$celextralocflag = 1; # flag if muliple locations for build but don't read past the first
			} else {
			$celeralocstring = &cellocfun(@array);
			$cellocflag = 1;}
		} elsif ($refflag == 1) {
			if ($reflocflag == 1){$refextralocflag = 1;
			} else {$refencelogstring = &reflocfun(@array);
			$reflocflag = 1;}
		} #elsif ($celeraflag != 1 && $reflag != 1) {die "Error: Location information without associated contig at $rs_id";}
		$endflag = 0;}
}

if ($rs_id !~ /^$/) {&newline};

# close the pipes
close(IN);
close(OUT);

$endtime = time();
$elapsedtime = $endtime - $starttime;
print ("start at $starttime, end at $endtime, took $elapsedtime. Thank you for playing. \n");

