#!/usr/bin/perl
#
# Annotates every operon listed in coliInterFullNames.txt (Uri Alon's file)
# with the ontology entries of its genes and with the MultiFun category
# digits found for those entries.
#
# Inputs, all read from the current directory:
#   OperonSetText           whitespace-separated; operon name in field 1,
#                           comma-separated gene list in field 4 (the text
#                           before the first '|' of each item is the gene).
#   ontology.txt            tab-separated; gene name in field 3, the entries
#                           kept are fields 13 onward.
#   coliInterFullNames.txt  whitespace-separated "<number> <operon name>".
#   multifun2go.txt         lines in which "MultiFun:<digit>..." precedes the
#                           ontology entry it maps to.
# Output:
#   finalData.txt           tab-separated flat file, opened in append mode.
#   Diagnostics about unmatched operons/entries go to STDOUT.

use strict;
use warnings;
use Data::Dumper;

# Opens $path with the given three-argument open mode and returns the lexical
# filehandle. Dies with the file name and the OS error when the open fails.
sub open_or_die {
    my ($mode, $path) = @_;
    open(my $fh, $mode, $path) or die "Couldnt open $path: $!";
    return $fh;
}

# Returns, in gene order, every ontology entry stored for the listed genes.
# Genes that have no entry in %$ontology_for contribute nothing.
sub ontology_terms_for {
    my ($genes, $ontology_for) = @_;
    return map  { @{ $ontology_for->{$_} } }
           grep { $ontology_for->{$_} } @{$genes};
}

# ---------------------------------------------------------------------------
# MAKE HASH OPERONS => GENES (from the OperonSetText file)
# ---------------------------------------------------------------------------
my %operons;
{
    my $operon_fh = open_or_die('<', 'OperonSetText');
    while (my $row = <$operon_fh>) {
        chomp $row;
        my ($operon, undef, undef, $genes) = split /\s+/, $row;
        next unless defined $genes;    # row has fewer than four fields

        # Only the part before the first '-' is used as the operon key, so
        # that e.g. "acs-yjcH-actP" is stored under "acs".
        my $operon_key = (split /-/, $operon)[0];
        next unless defined $operon_key && length $operon_key;

        for my $item (split /,/, $genes) {
            my $gene = (split /\|/, $item)[0];
            next unless defined $gene;    # empty list item
            push @{ $operons{$operon_key} }, $gene;
        }
    }
    close $operon_fh;
}

print "$_:", join(',', @{ $operons{$_} }), "\n" for sort keys %operons;
#print Dumper(\%operons);

# ---------------------------------------------------------------------------
# MAKE HASH GENES => ONTOLOGY (from the ontology.txt file)
# ---------------------------------------------------------------------------
my %ontologyHash;
{
    my $ontology_fh = open_or_die('<', 'ontology.txt');
    while (my $row = <$ontology_fh>) {
        chomp $row;
        my @fields = split /\t+/, $row;    # tab-delimited components
        my $gene   = $fields[2];           # gene name is the third field
        next unless defined $gene;         # row has fewer than three fields

        # Keep the fields from index 12 (the 13th column) onward; a shorter
        # row yields an empty slice.
        push @{ $ontologyHash{$gene} }, @fields[ 12 .. $#fields ];
    }
    close $ontology_fh;
}
#print Dumper(\%ontologyHash);

# ---------------------------------------------------------------------------
# MAKE HASH OPERON NUMBER => ONTOLOGY by going through Uri Alon's file and
# matching its operon names against the keys of %operons.
#
# Matching rules:
#  (1) "acs-yjcH-actP" must match "acs": handled above by keying %operons on
#      the text before the first '-'.
#  (2) "ada_alkB" must match "ada-alkB": only the text before the first '_'
#      of Alon's name is used for the lookup.
#  (3) "araFG_araH_1H_2" (lookup "araFG") must match "araFGH": when the exact
#      lookup fails, every %operons key that contains the lookup string is
#      accepted instead.
# ---------------------------------------------------------------------------
my %opToGo;
{
    my $alon_fh = open_or_die('<', 'coliInterFullNames.txt');
    while (my $row = <$alon_fh>) {
        chomp $row;
        my ($num, $op) = split /\s+/, $row;
        next unless defined $op && length $op;    # blank or one-field row

        my $opMatch = (split /_/, $op)[0];
        $opMatch = '' unless defined $opMatch;    # e.g. $op is "_"

        if (length $opMatch && $operons{$opMatch}) {
            my @terms = ontology_terms_for($operons{$opMatch}, \%ontologyHash);
            push @{ $opToGo{$num} }, @terms if @terms;
            next;
        }

        print "OPERON FROM ALON FILE: ", $op, " OR ", $opMatch,
            " DID not match OperonSetText operonName\n";

        # An empty lookup string would be contained in every key; skip it.
        next unless length $opMatch;

        # Rule (3): plain substring search (no regex, so operon names are
        # never interpreted as patterns).
        for my $candidate (sort keys %operons) {
            next unless index($candidate, $opMatch) >= 0;
            print "BUT $candidate contains $opMatch\n";

            # NOTE(review): the lookup string is replaced by the matching
            # key, so later candidates are compared against that longer key
            # rather than the original substring. This is the script's
            # existing behavior and is kept as-is -- confirm it is intended.
            $opMatch = $candidate;

            my @terms = ontology_terms_for($operons{$opMatch}, \%ontologyHash);
            push @{ $opToGo{$num} }, @terms if @terms;
        }
    }
    close $alon_fh;
}

# ---------------------------------------------------------------------------
# MAKE HASH OPERON NUMBER => MULTIFUN DIGITS
# For each ontology entry, find the multifun2go.txt lines that contain it as
# a whole word and record the digit following "MultiFun:" in the text that
# precedes the entry on that line.
# ---------------------------------------------------------------------------
my %opToCat;

# The mapping file never changes while the script runs: read it once instead
# of once per ontology entry.
my $multifun_fh    = open_or_die('<', 'multifun2go.txt');
my @multifun_lines = <$multifun_fh>;
close $multifun_fh;

for my $num (sort keys %opToGo) {
    for my $term (@{ $opToGo{$num} }) {

        # Case-insensitive because multifun2go.txt often capitalizes entries
        # that ontology.txt has in lower case. \Q..\E keeps punctuation in
        # the entry from being read as regex syntax.
        my $term_re = qr/\b\Q$term\E\b/i;
        my $hits    = 0;

        for my $entry (@multifun_lines) {
            next unless $entry =~ $term_re;

            # Text before the match (what $` would hold).
            my $before = substr($entry, 0, $-[0]);
            next unless $before && $before =~ /MultiFun:(\d).+/;

            push @{ $opToCat{$num} }, $1;
            $hits++;
        }

        if ($hits == 0) {
            print "NO MATCH BETWEEN $term AND MULTIFUNC FUNCTION FILE OR RETRIVAL OF CATAGORY FAILED: Operon = $num, \n";
        }
    }
}
#print Dumper(\%opToCat);

# ---------------------------------------------------------------------------
# Write the flat file, in the order of Uri Alon's members list:
#   <number> TAB <operon> TAB <ontology entries> TAB <MultiFun digits>
# Operons without any MultiFun digit get "NO ONTOLOGY DATA" instead, and that
# line is echoed to STDOUT as well.
# ---------------------------------------------------------------------------
my $names_fh = open_or_die('<',  'coliInterFullNames.txt');
my $out_fh   = open_or_die('>>', 'finalData.txt');

while (my $row = <$names_fh>) {
    chomp $row;
    my ($num, $op) = split /\s+/, $row;

    # Blank or short rows still produce an output line, as before, but
    # without "uninitialized value" warnings.
    $num = '' unless defined $num;
    $op  = '' unless defined $op;

    if ($opToCat{$num}) {
        print {$out_fh} "$num\t$op\t",
            join(',', @{ $opToGo{$num} }), "\t",
            join(',', @{ $opToCat{$num} }) . "\n";
    }
    else {
        print {$out_fh} "$num\t$op\tNO ONTOLOGY DATA\n";
        print "$num\t$op\tNO ONTOLOGY DATA\n";
    }
}

close $names_fh;
# Buffered write errors only surface at close time.
close $out_fh or die "Couldnt close finalData.txt: $!";