#!/usr/bin/perl -w use strict; use Math::CDF qw(:all); my $prob = pnorm(1.96); ### START OF INPUT ############################################################################################################################################ # In this step binomial statistics is performed on the output file (or files) of Analyze_mutation.pl to separate sequencing errors from statistically significant modifications. # Usage: $ perl Binomial_analysis.pl INPUT_file # OR # $ perl Binomial_analysis.pl INPUT_file1 INPUT_file2 INPUT_file3 ... # Binomial_analysis.pl can receive several input files and merge them together # Previous steps: # 1) process_reads.pl # 2) bowtie -n 1 -e 50 -a -m 1 --best --strata --phred33-quals --trim3 2 BOWTIE_INDEXES_PATHWAY FASTQ_FILE > BOWTIE_OUTPUT_FILE # 3) Analyze_mutation.pl my @file_names=@ARGV; # the output file or files of Analyze_mutation.pl open FILE_miRNA_Bowtie, "<", "Mouse_pre_miRNA_aligned_against_the_genome.txt" or die $!; # miRNA aligned to genome #open FILE_miRNA_Bowtie, "<", "Human_pre_miRNA_aligned_against_the_genome.txt" or die $!; # miRNA aligned to genome my $p_mismatch=0.001; # the sequencing error probability my $max_position_to_check=19; # ignore last two bases my $BH_or_Bonferroni="Bonferroni"; # multiple testing correction: a choice between "BH" (Benjamini-Hochberg correction) or "Bonferroni" (Bonferroni correction) my $P_value=0.05; # the P-value required after multiple testing correction (this is the FDR if Benjamini-Hochberg correction is chosen) my $minimal_number_of_reads=5; # the bionomial test will be performed in all locations with at least this number of reads my $maximal_percent_change=100; # maximum percent change allowed, setting this value to 100 indicate that even locations in which all the sequencing reads are different from the genomic sequence are considered my $first_N_bases=2; # If the mismatch is in the first N bases of the miRNA (as defined by $first_N_bases) AND the mismatch is in isomir with expression level which is significantly below the 'main' isomir, the mismatch is ignored (as we observed 5' adenylation and uridylation in low-abundance isomirs) my $fold_change_between_isomirs=10; # If the mismatch is in the first N bases of the miRNA AND the mismatch is in isomir with expression level which is significantly below the 'main' isomir (as defined by $fold_change_between_isomirs), the mismatch is ignored my $file_test="test.txt"; # de-bugging output file ### END OF INPUT ############################################################################################################################################# my $total_number_of_files=$#ARGV+1; my $string; my $global_flag; my $number_of_expressed_miRNA; my $number_of_multiple_tests; my $counter; my $current_p_value; my @p_values; my @sorted_p_values; my $length_p_values; my $i; my $number_of_reads; my %hash_all_counts; my $miR_name; my $current_base; my @all_miRs; my @all_locations; my %hash_location_pvalue; my $key; my @array_expression; my %hash_max_expression; my @array_values; my $flag_start_of_read; my $flag_order_of_magnitude_difference; my $global_running; my $global_counter; my %hash_convert_pre_miRNA_location_to_mature_location; my $temp_miR_name; my $previous_global_counter; my %hash_5p_or_3p; my %hash_all_letters; my %temp_hash_all_letters; my $temp_edited_levels; my $temp_edited_letter; my $total_expression_levels; my $edited_expression_levels; my %processed_p_values; my @all_processed_p_values; my %BH_adjusted_p_values; my @part_of_all_processed_p_values; my $multiple_tests_corrected_p_value; my @fields; my $name_of_miRNA; my $strand; my $chr; my $genomic_start; my $read_seq; my $tmp_location; my %hash_convert_position_in_miRNA_to_genomic_position; my $tmp_position; ### read the miRNA bowtie file into hash while ($string=) { # reading the data from the alignment file @fields=(); @fields=split(/\t/,$string); $name_of_miRNA=$fields[0]; $strand=$fields[1]; $chr=$fields[2]; $genomic_start=$fields[3]; $read_seq=$fields[4]; if ($strand eq "+") { for ($i=0;$i", $file_test or die $!; for ($global_running=0;$global_running<$total_number_of_files;$global_running++) { open FILE_R, "<", $file_names[$global_running] or die $!; $counter=0; $global_counter=0; $global_flag=1; $string=; while ($global_flag==1) { if ($string =~ /^[1,2].+?length:\s(\w+)/) { $counter++; $global_counter++; $hash_convert_pre_miRNA_location_to_mature_location{$miR_name}{$global_counter}=$counter; $number_of_reads=$1-3; $current_base=substr($string,2,1); if ($counter<=$max_position_to_check) { if (!($string=)) { $global_flag=0; } if ($string !~ /length:/) { if ($string =~ /([0-9]+)\t([0-9]+)\t([0-9]+)\t([0-9]+)/) { if (exists $hash_all_counts{$miR_name}{$global_counter}{"RNA"}) { $hash_all_counts{$miR_name}{$global_counter}{"A"}=$hash_all_counts{$miR_name}{$global_counter}{"A"}+$1; $hash_all_counts{$miR_name}{$global_counter}{"U"}=$hash_all_counts{$miR_name}{$global_counter}{"U"}+$2; $hash_all_counts{$miR_name}{$global_counter}{"G"}=$hash_all_counts{$miR_name}{$global_counter}{"G"}+$3; $hash_all_counts{$miR_name}{$global_counter}{"C"}=$hash_all_counts{$miR_name}{$global_counter}{"C"}+$4; } else { $hash_all_counts{$miR_name}{$global_counter}{"RNA"}=$current_base; $hash_all_counts{$miR_name}{$global_counter}{"A"}=$1; $hash_all_counts{$miR_name}{$global_counter}{"U"}=$2; $hash_all_counts{$miR_name}{$global_counter}{"G"}=$3; $hash_all_counts{$miR_name}{$global_counter}{"C"}=$4; } } if (!($string=)) { $global_flag=0; } if ($string =~ /^([A|U|G|C]{2})\s:/) { if (!($string=)) { $global_flag=0; } } } else { %temp_hash_all_letters=%hash_all_letters; delete($temp_hash_all_letters{$current_base}); if (exists $hash_all_counts{$miR_name}{$global_counter}{"RNA"}) { $hash_all_counts{$miR_name}{$global_counter}{$current_base}=$hash_all_counts{$miR_name}{$global_counter}{$current_base}+$number_of_reads; } else { $hash_all_counts{$miR_name}{$global_counter}{"RNA"}=$current_base; $hash_all_counts{$miR_name}{$global_counter}{$current_base}=$number_of_reads; foreach (keys %temp_hash_all_letters) { $hash_all_counts{$miR_name}{$global_counter}{$_}=0; } } } } else { if (!($string=)) { $global_flag=0; } } } elsif ($string =~ /length:/) { $global_counter++; $counter=0; if (!($string=)) { $global_flag=0; } } else { if ($string =~ /[0-9]+\t([a-zA-Z]{3}.+?)\n/) { $global_counter=0; $counter=0; $miR_name=$1; } if (!($string=)) { $global_flag=0; } } } close FILE_R; } ### asign P-value to each location in the mature/star sequence @all_miRs=keys %hash_all_counts; @array_expression=(); foreach(@all_miRs) { $previous_global_counter=0; $miR_name=$_; $temp_miR_name=$miR_name; @all_locations=keys %{$hash_all_counts{$miR_name}}; @all_locations = sort { $a <=> $b } @all_locations; foreach(@all_locations) { $global_counter=$_; if (exists $hash_all_counts{$miR_name}{$global_counter}{"RNA"}) { $current_base=$hash_all_counts{$miR_name}{$global_counter}{"RNA"}; %temp_hash_all_letters=%hash_all_letters; delete($temp_hash_all_letters{$current_base}); $temp_edited_levels=0; $temp_edited_letter="A"; foreach (keys %temp_hash_all_letters) { if ($hash_all_counts{$miR_name}{$global_counter}{$_}>=$temp_edited_levels) { $temp_edited_levels=$hash_all_counts{$miR_name}{$global_counter}{$_}; $temp_edited_letter=$_; } } $total_expression_levels=$hash_all_counts{$miR_name}{$global_counter}{"A"}+$hash_all_counts{$miR_name}{$global_counter}{"U"}+$hash_all_counts{$miR_name}{$global_counter}{"G"}+$hash_all_counts{$miR_name}{$global_counter}{"C"}; $edited_expression_levels=$temp_edited_levels; print FILE_test $miR_name,"\t",$global_counter,"\t",$current_base.$temp_edited_letter,"\t",$edited_expression_levels,"\t",$total_expression_levels,"\n"; if (($edited_expression_levels>0)&&($total_expression_levels>=$minimal_number_of_reads)) { $current_p_value=pbinom(($total_expression_levels-$edited_expression_levels),$total_expression_levels,(1-$p_mismatch)); print FILE_test $miR_name,"\t",$global_counter,"\tP-val=",$current_p_value,"\n"; } else { $current_p_value=1; } push(@p_values,$current_p_value); $hash_location_pvalue{$miR_name."\t".$global_counter."\t".$current_base.$temp_edited_letter."\t".$edited_expression_levels."\t".$total_expression_levels}=$current_p_value; if ((($global_counter-$previous_global_counter)!=1)&&($previous_global_counter!=0)) { $hash_max_expression{$temp_miR_name}=max(@array_expression); @array_expression=(); $temp_miR_name=$miR_name."-3p"; push(@array_expression,$total_expression_levels); } else { push(@array_expression,$total_expression_levels); } $previous_global_counter=$global_counter; $hash_5p_or_3p{$miR_name}{$global_counter}=$temp_miR_name; } } $hash_max_expression{$temp_miR_name}=max(@array_expression); @array_expression=(); } close FILE_test; ### calc BH-corrected P-values @sorted_p_values = sort {$a <=> $b} @p_values; $length_p_values=$#sorted_p_values+1; #print "number of tests:\t",$length_p_values,"\n"; $i=0; foreach $key (sort { $hash_location_pvalue{$a} <=> $hash_location_pvalue{$b}} keys %hash_location_pvalue) { $i=$i+1; $processed_p_values{$key}=$hash_location_pvalue{$key}*$length_p_values/$i; } @all_processed_p_values=values %processed_p_values; @all_processed_p_values = sort { $a <=> $b } @all_processed_p_values; $i=-1; foreach $key (sort { $processed_p_values{$a} <=> $processed_p_values{$b}} keys %processed_p_values) { $i=$i+1; @part_of_all_processed_p_values=@all_processed_p_values[$i..$#all_processed_p_values]; $BH_adjusted_p_values{$key}=min(@part_of_all_processed_p_values); if ($BH_adjusted_p_values{$key}>$P_value) { last; } } ### display the P-values after multiple testing correction print "#CHROM\tSTART\tEND\tSTRAND\tmiRNA_name\tLocation_inside_pre_miRNA\tMismatch_type\tNumber_of_reads_with_the_mismatch\tTotal_number_of_reads_in_this_position\tRaw_P_value\tBonferroni_P_value\tBH_P_value\n"; foreach $key (sort { $hash_location_pvalue{$a} <=> $hash_location_pvalue{$b}} keys %hash_location_pvalue) { if ($BH_or_Bonferroni eq "BH") { $multiple_tests_corrected_p_value=$BH_adjusted_p_values{$key}; } else { $multiple_tests_corrected_p_value=$hash_location_pvalue{$key}*$length_p_values; } if ($multiple_tests_corrected_p_value<=$P_value) { @array_values=split("\t",$key); if ($hash_max_expression{$hash_5p_or_3p{$array_values[0]}{$array_values[1]}}>($array_values[4]*$fold_change_between_isomirs)) { $flag_order_of_magnitude_difference=1; } else { $flag_order_of_magnitude_difference=0; } if ($hash_convert_pre_miRNA_location_to_mature_location{$array_values[0]}{$array_values[1]}<=$first_N_bases) { $flag_start_of_read=1; } else { $flag_start_of_read=0; } if (($flag_start_of_read==0)||($flag_order_of_magnitude_difference==0)) { print $hash_convert_position_in_miRNA_to_genomic_position{$array_values[0]."\t".$array_values[1]},"\t",$key,"\t",$hash_location_pvalue{$key},"\t",$hash_location_pvalue{$key}*$length_p_values,"\t",$BH_adjusted_p_values{$key},"\n"; } } else { last; } } ############################################################################################################################################# sub max { my @numbers = @_; my $max; my $running; for($running=0;$running<=$#numbers;$running++) { if (defined($numbers[$running])) { $max = $numbers[$running]; last; } } for($running=0;$running<=$#numbers;$running++) { if (defined($numbers[$running])) { if ($numbers[$running] > $max) { $max = $numbers[$running]; } } } return $max; } ############################################################################################################################################# sub min { my @numbers = @_; my $min; my $running; for($running=0;$running<=$#numbers;$running++) { if (defined($numbers[$running])) { $min = $numbers[$running]; last; } } for($running=0;$running<=$#numbers;$running++) { if (defined($numbers[$running])) { if ($numbers[$running] < $min) { $min = $numbers[$running]; } } } return $min; }