#!/usr/bin/perl
#
# Maps the operon numbers used in coliInterFullNamesMod.txt to MultiFun
# second-level category numbers, then writes the original and randomly
# relabelled versions of the table in data.txt.
#
# Inputs (read from the current directory):
#   OperonSetText              operon name in column 1, gene list in column 4
#   coliInterFullNamesMod.txt  "<number> <operon name>" per line
#   EColiGeneToMultiFun.txt    tab-separated "<gene> <MultiFun term>"
#   data.txt                   tab-separated table; first HEADER_LINES lines dropped
#
# Outputs:
#   AlonNumberToMultiFunc.txt  "<number> \t <operon> \t <categories>" per line
#   MotifRandomOntologies.txt  category table for the original data followed by
#                              NUM_SHUFFLES randomly relabelled tables
#   STDOUT                     progress / diagnostic dump of the same information

use strict;
use warnings;
use Data::Dumper;
use List::Util qw(shuffle);

use constant MAX_GENE     => 423;     # operon numbers run from 1 to MAX_GENE
use constant NUM_SHUFFLES => 1000;    # number of randomised tables to emit
use constant HEADER_LINES => 5;       # leading lines of data.txt to discard
use constant LAST_COLUMN  => 2;       # columns 0..LAST_COLUMN of data.txt are used

my %operons;              # operon key (name before the first '-') => [ genes ]
my @operon_name_parts;    # one entry per OperonSetText line: [ '-'-separated name parts ]
my %ontologyHash;         # gene => [ MultiFun terms ]
my %alonToGene;           # operon number => [ genes ]
my %alonToOntology;       # operon number => [ MultiFun terms ]
my %alonToMulti;          # operon number => [ "Y.X" category numbers ]

open(my $multifun_out, '>', 'AlonNumberToMultiFunc.txt')
    or die "Couldnt open AlonNumberToMultiFunc.txt for writing: $!";
open(my $random_out, '>', 'MotifRandomOntologies.txt')
    or die "Couldnt open MotifRandomOntologies.txt for writing: $!";

####################################################################################
# Build the hash OPERON => GENES from the OperonSetText file.
open(my $operon_fh, '<', 'OperonSetText')
    or die "Couldnt open operonSet: $!";
while (my $line = <$operon_fh>) {
    chomp $line;
    my ($operon, undef, undef, $genes) = split(/\s+/, $line);
    next unless defined $operon;

    # Remember every '-'-separated part of the operon name so the last-resort
    # matching stage below does not have to re-read the file for each operon.
    push @operon_name_parts, [ split(/-/, $operon) ];

    next unless defined $genes;

    # Only the first part of the operon name is used as the key.
    my $operon_key = (split(/-/, $operon))[0];
    foreach my $item (split(/,/, $genes)) {
        my $gene = (split(/\|/, $item))[1];    # gene name is the 2nd '|' field
        push @{ $operons{$operon_key} }, $gene;
    }
}
close($operon_fh);

print "$_:", join(',', @{$operons{$_}}), "\n" foreach (sort keys %operons);

####################################################################################
# Build the hash OPERON NUMBER => GENES by matching each operon name in
# coliInterFullNamesMod.txt against the operon keys, trying progressively
# looser matches until one succeeds.
open(my $alon_fh, '<', 'coliInterFullNamesMod.txt')
    or die "Couldnt open 3: $!";
ALON_LINE:
while (my $line = <$alon_fh>) {
    chomp $line;
    my ($num, $op) = split(/\s+/, $line);

    # The name to match is the operon name before any underscore.
    my $opMatch = defined $op ? (split(/_/, $op))[0] : undef;
    if (!defined $opMatch || $opMatch eq '') {
        # An empty name would match everything (or, as a regex, re-run the
        # last successful pattern), so such lines are not processed.
        warn "Skipping line $. of coliInterFullNamesMod.txt: no operon name\n"
            if $line =~ /\S/;
        next ALON_LINE;
    }

    # Stage 1: exact match on the operon key.
    if ($operons{$opMatch}) {
        push @{ $alonToGene{$num} }, @{ $operons{$opMatch} };
        next ALON_LINE;
    }

    my $count = 0;

    # Stage 2: the name is contained in an operon key. Names are compared as
    # literal substrings, never as regular expressions.
    # NOTE(review): $opMatch is replaced by the matching key, so later keys are
    # compared against that key rather than the name from the file. Kept as-is
    # because changing it would alter which genes are collected -- confirm intent.
    foreach my $t (sort keys %operons) {
        next unless index($t, $opMatch) >= 0;
        $opMatch = $t;
        push @{ $alonToGene{$num} }, @{ $operons{$opMatch} };
        $count++;
    }

    # Stage 3: e.g. atoC should match atoSC -- same lower-case prefix and at
    # least one shared capital letter / digit from the suffix.
    if ($count == 0) {
        my ($part1, $part2) = $opMatch =~ /([a-z]+)([0-9A-Z]*)/;

        # With no suffix there is no letter to share (and an empty character
        # class would be a fatal regex error), so this stage is skipped.
        if (defined $part2 && length $part2) {
            foreach my $t (sort keys %operons) {
                next unless $t =~ /$part1/ && $t =~ /[$part2]/;
                $opMatch = $t;
                push @{ $alonToGene{$num} }, @{ $operons{$opMatch} };
                $count++;
            }
        }
    }

    # Stage 4: some operon names were dropped from the keys because they appear
    # after a hyphen in OperonSetText; search every hyphen-separated part.
    if ($count == 0) {
        foreach my $parts (@operon_name_parts) {
            foreach my $t (@{$parts}) {
                next unless index($t, $opMatch) >= 0;
                $opMatch = $parts->[0];
                if ($operons{$opMatch}) {
                    push @{ $alonToGene{$num} }, @{ $operons{$opMatch} };
                    $count++;
                }
            }
        }
    }

    if ($count == 0) {
        print $opMatch, " --------------- OPS NOT MATCHED \n";
    }
}
close($alon_fh);

####################################################################################
# Build the hash GENE => MULTIFUN TERMS.
open(my $gene_multifun_fh, '<', 'EColiGeneToMultiFun.txt')
    or die "Couldnt open EColiGeneToMultiFun.txt: $!";
while (my $line = <$gene_multifun_fh>) {
    chomp $line;
    my @fields = split(/\t+/, $line);    # tab-delimited components
    my ($gene, $multiFunc) = @fields;
    next unless defined $multiFunc;      # nothing to record for this line
    push @{ $ontologyHash{$gene} }, $multiFunc;
}
close($gene_multifun_fh);

####################################################################################
# Collect, for every operon number, the MultiFun terms of all of its genes.
# Genes without an entry in %ontologyHash contribute nothing.
foreach my $t (sort keys %alonToGene) {
    foreach my $g (@{ $alonToGene{$t} }) {
        next unless defined $g && $ontologyHash{$g};
        push @{ $alonToOntology{$t} }, @{ $ontologyHash{$g} };
    }
}

####################################################################################
# Reduce every MultiFun term to its second-level category number Y.X.
# NOTE(review): the pattern requires at least one character after the Y.X
# number, so a term that ends exactly at the second level is either dropped or
# loses its last digit. Kept unchanged; confirm against the term format.
foreach my $t (sort keys %alonToOntology) {
    foreach my $cata (@{ $alonToOntology{$t} }) {
        if ($cata =~ /(\d+\.\d+).+/) {
            print "$t => $1 found in $cata\n";
            push @{ $alonToMulti{$t} }, $1;
        }
    }
}

####################################################################################
# Write OPERON NUMBER / OPERON NAME / CATEGORY NUMBERS for every line of
# coliInterFullNamesMod.txt; operons with no category get 0.
open(my $alon_names_fh, '<', 'coliInterFullNamesMod.txt')
    or die "Couldnt open 3: $!";
while (my $line = <$alon_names_fh>) {
    chomp $line;
    my ($num, $op) = split(/\s+/, $line);
    $num = '' unless defined $num;    # missing fields are printed as empty
    $op  = '' unless defined $op;

    my $cats = $alonToMulti{$num};
    if ($cats) {
        my $row = "$num \t $op \t @{$cats} \n";
        print $row;
        print {$multifun_out} $row;
    }
    else {
        print "$num \t $op \t 0 \n";
        print {$multifun_out} "$num \t $op \t 0\n";
    }
}
close($alon_names_fh);
close($multifun_out)
    or die "Couldnt close AlonNumberToMultiFunc.txt: $!";

####################################################################################
# Load the table of operon numbers from data.txt.
open(my $data_fh, '<', 'data.txt')
    or die "Couldnt open data.txt: $!";
my @data = <$data_fh>;
close($data_fh);
splice(@data, 0, HEADER_LINES);

my @final_data;
foreach my $line (@data) {
    chomp $line;
    push @final_data, [ split(/\t/, $line) ];
}

# NOTE(review): the first and the last remaining rows are never processed;
# presumably they are not data rows -- confirm against data.txt.
my @rows    = (1 .. $#final_data - 1);
my @columns = (0 .. LAST_COLUMN);

# Write the category lists of the original table. The label line follows the
# rows it describes.
foreach my $row (@rows) {
    foreach my $col (@columns) {
        my $id   = $final_data[$row][$col];
        my $cats = defined $id ? $alonToMulti{$id} : undef;
        if ($cats) {
            print {$random_out} " [", "@{$cats}", "] ";
            print " ", "@{$cats}", " ";
        }
        else {
            print {$random_out} " [0] ";
            print " [0] ";
        }
    }
    print {$random_out} "\n";
    print "\n";
}
print {$random_out} "#Original Data 1\n";

####################################################################################
# Write NUM_SHUFFLES randomised tables. Each round draws a uniformly random
# permutation of the operon numbers 1..MAX_GENE and relabels the table in
# place, so every round permutes the result of the previous one.
my @gene_ids = (1 .. MAX_GENE);
foreach my $shuffle (1 .. NUM_SHUFFLES) {
    my %hashMix;
    @hashMix{@gene_ids} = shuffle(@gene_ids);

    foreach my $row (@rows) {
        foreach my $col (@columns) {
            # Numbers outside 1..MAX_GENE have no replacement and become undef.
            my $id     = $final_data[$row][$col];
            my $new_id = defined $id ? $hashMix{$id} : undef;
            $final_data[$row][$col] = $new_id;

            my $cats = defined $new_id ? $alonToMulti{$new_id} : undef;
            if ($cats) {
                print {$random_out} " [", "@{$cats}", " ]";
                print "[ ", "@{$cats}", " ";
            }
            else {
                print " [0] ";
                print {$random_out} " [0] ";
            }
        }
        print {$random_out} "\n";
        print "\n";
    }
    print {$random_out} "#Random Data $shuffle\n";
}
close($random_out)
    or die "Couldnt close MotifRandomOntologies.txt: $!";
####################################################################################