#!/usr/local/bin/perl # C+G segmentation. seq returned is already 0/1 binary # feb 11, 2001 (for CpG island) # Mar 15, 2001 ################################################################# # main # if($#ARGV != 1){ print "Usage: [program] [file] [strength]\n"; exit; } $s0 = $ARGV[1]; ############# get all information ######### %seq= &reading_seq; ############# get all information ######### # maxi is the end of the left domain. $par{ $seq{start}}= 0; $par{ $seq{end}}= 0; $num_seg=0; $d_bic = 0; %comp= ( a=>0, c=>0,); $i=0; while( defined($tmp = substr $seq{text}, $i, 1) ){ if($tmp){ $comp{$tmp}++; } $i++; if($i > $seq{length} ){ last; } } $ent = &entropy (\%comp, $seq{length}); # print "$seq{text}\n"; print "entropy=$ent\n"; print "$seq{start} ->"; print "$seq{end} "; print "length= $seq{length}\n"; $k=1; $m2ll_orig= 2*$seq{length}*$ent; $bic_orig= $m2ll_orig + log($seq{length})*$k; $min_size=3; ###################### main sub call ############### &rec_seg($seq{text}, $seq{start}, $seq{end}); ###################### main sub call ############### print "-2logL_before = $m2ll_orig\n"; print "BIC before = $bic_orig\n"; print "number of segmentation is $num_seg\n"; print "total decrease of BIC is $d_bic\n"; print "threshold for strength is $s0\n"; $bic_new = $bic_orig + $d_bic; print "BIC new= $bic_new\n"; print "\n start end length Delta(-2logL) strength\n"; @new_keys = sort {$a <=> $b} (keys %par); foreach $key (@new_keys){ if($key != 1){ $from = $to+1; $to = $key; $l= $to-$from+1; printf("%7d %7d %7d %6.3f \t%6.4f\n", $from,$to,$l,$par{$key},$par_per{$key}); } } # end of main # ################################################################# ################################################################# # sub rec_seg(): recursive segmentation. calling _12seg() ################################################################# sub rec_seg { my ($seq, $l, $r) =@_; my $length= $r-$l+1; my ($par_i, $d_2ll); ############ before recursion, 1->2 seg # the length should be at least K (or K+1) to do segmentation # if($length > $min_size){ &_12seg($seq, $l, $r, \$par_i, \$d_2ll); $thre = log($length)*2; # fixed here BIC, CpG $s = ($d_2ll-$thre)/$thre; printf("L=%d (%d - %d - %d) %8.4f >? %8.4f (%5.3f)\n", $length, $l, $par_i,$r, $d_2ll, $thre, $s); } else{ break; # print "STOP (too small) ($l $r)\n"; } ############ in recursion # only if the 1-> 2 segmentation leads to increase of BIC, # it counts, and we will continue # $strength= ($d_2ll-$thre)/$thre; if( $strength > $s0 ){ # print " [$l - $par_i- $r]"; # printf("change in -2log(L) = %5.3f > $thre \n",$d_2ll); $par {$par_i} = $d_2ll; $par_per {$par_i} = ($d_2ll-$thre)/$thre; $num_seg++; $d_bic += (-$d_2ll + $thre); my $left_seq= substr $seq, 0, $par_i-$l+1; &rec_seg($left_seq,$l,$par_i); my $right_seq= substr $seq, $par_i-$l+1, $r-$par_i; &rec_seg($right_seq,$par_i+1,$r); } else{ # print "STOP(no improvement) ($l $par_i $r)=> $strength\n"; } } ################################################################# # sub 12seg(): 1-> 2 segmentation # 1.(sub) sequence. # 2.left (absolute coordinate) # 3.right end (absolute coordinate). # 4.reference to i # 5.calculate only max 2log(L2/L1) ################################################################# sub _12seg{ my ($seq, $l, $r, $par_i_ref, $d_2ll_ref)= @_; my $i =0; my $length= $r-$l+1; my $d_2ll_max=0; my %comp= ( a=>0, c=>0, ); while( defined($tmp = substr $seq, $i, 1) ){ ####### aug-3-99: the end of a string is "\0", empty, ####### which cause hash to add one more entry if($tmp){ $comp{$tmp}++; } $i++; if($i > $length){ last; } } my %comp_left =( a=>0, c=>0,); $lleft=0; $d_2ll=0; # 2log(L2/L1) > 0 after minus before for($i=0; $i<= ($length-2); $i++){ $base = substr $seq, $i, 1; if($base){ if($comp_left{$base}){ $t1= $comp_left{$base}*log(1+ 1/$comp_left{$base}); } else{ $t1=0; } $t2=log($comp_left{$base}+1); if( ($comp{$base}-$comp_left{$base}-1) > 0 ){ $t3 =($comp{$base}-$comp_left{$base}-1)*log(1+1/($comp{$base}-$comp_left{$base}-1)); } else{ $t3=0;} if($comp{$base}> $comp_left{$base}){ $t4=log($comp{$base}-$comp_left{$base}); } else{ $t4=0;} if( $lleft){ $t5 = $lleft*log(1+1/$lleft); } else{ $t5=0; } $t6=log($lleft+1); if($lleft < $length){ $t7=($length-$lleft)*log(1-1/($length-$lleft)); } else{ $t7=0;} $t8 = log($length-$lleft-1); $t=2*($t1+$t2-$t3-$t4-$t5-$t6-$t7+$t8); $d_2ll += $t; $comp_left{ $base } ++; $lleft++; } if( $d_2ll > $d_2ll_max){ $d_2ll_max= $d_2ll; $$par_i_ref = $i; # tricky! } } # end of i-loop (going through all positions) $$par_i_ref +=$l; # absolute coordinate again! $$d_2ll_ref= $d_2ll_max; } ################################################################# # function entropy(): # 1. hash with the base counts; # 2. total number of counts ################################################################# sub entropy { my ($ref_comp, $total_count) = @_; my $ent =0; if($total_count==0){ return 0; } #count is zero foreach $key (keys %$ref_comp){ $each_count= $ref_comp->{$key}; if($each_count){ $ent += $each_count*log($each_count); } } $ent /= (-$total_count); $ent +=log($total_count); return $ent; } ################################################################# # function reading_seq: ################################################################# sub reading_seq{ my ($f)= $ARGV[0]; open(IN,"< $f"); my $seq_tmp =""; while( $line = ){ chomp $line; $line =~ s/\n//; if( $line !~ /^>/) { $seq_tmp .= $line; } } close (IN); $seq_tmp= lc($seq_tmp); # print "original seq:\n$seq_tmp\n"; $lseq=length($seq_tmp); $seq{end} =$lseq; $seq{start}=1; $seq{length}= $seq{end}-$seq{start}+1; $i=0; $seq_cg=""; $pre=0; while( defined($tmp = substr $seq_tmp, $i, 1)){ if($tmp eq "t"){ $tmp="a"; } if($tmp eq "g"){ $tmp="c"; } $seq_cg .=$tmp; $i++; } $seq{text} = substr $seq_cg, $seq{start}-1, $seq{length}; return %seq; }