在生物信息分析过程中,如果遇到特别大的fa文件,难免需要对文件进行分割。但如果只是按照序列条数分割,分割之后各文件的大小可能差异很大,这样会导致一些问题。因此,按照文件大小均分是很有意义的。
小编写了一个按大小均分fa文件的Perl程序,希望可以帮到大家。
#!/usr/bin/perl -w
# Split a FASTA file into roughly equal-sized pieces, measured by total
# sequence length rather than by number of records.
#
# Usage: see USAGE() below (-fa <file> -num <int> [-od <dir>]).
#
# Output files are named <outdir>/<basename of -fa>.<index>.fa with the index
# starting at 1. Each output record keeps only the first whitespace-delimited
# token of the header as its id, and the sequence is written on a single line.
#
# NOTE: all records are held in memory before writing, so peak memory use is
# on the order of the input file size.
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use FindBin qw($Bin $Script);
use File::Basename qw(basename dirname);
use File::Path qw(make_path);

my $BEGIN_TIME = time();
my $version    = "1.0.0";

# ------------------------------------------------------------------
# GetOptions
# ------------------------------------------------------------------
my ($fa, $num, $outdir);
GetOptions(
    "help|?" => \&USAGE,
    "fa:s"   => \$fa,
    "num:s"  => \$num,
    "od:s"   => \$outdir,
) or USAGE();
USAGE() unless ($fa and $num);

# A non-numeric or non-positive -num would otherwise surface as a confusing
# division error or a nonsensical chunk size.
die "-num must be a positive integer, got '$num'\n"
    unless $num =~ /\A[1-9]\d*\z/;

$outdir ||= "./";
# make_path avoids passing user input through a shell and croaks on failure.
make_path($outdir) unless -d $outdir;
my $faname = basename($fa);

# ------------------------------------------------------------------
# Main program
# ------------------------------------------------------------------
my ($records, $totallen) = read_fasta($fa);
die "No sequences found in $fa\n" unless @$records;

# Target cumulative sequence length per output file. The +1 guarantees the
# greedy fill below never produces more than $num files.
my $splitsize = int($totallen / $num) + 1;
write_chunks($records, $splitsize, "$outdir/$faname");

print STDOUT "\nDone. Total elapsed time : ", time() - $BEGIN_TIME, "s\n";

# ------------------------------------------------------------------
# sub function
# ------------------------------------------------------------------

# Read every record of a FASTA file.
#   $path - input FASTA file
# Returns (\@records, $total_length); each record is [ $id, $sequence ],
# where $id is the first whitespace-delimited token of the header line and
# $sequence is all sequence lines joined together.
# Dies if the file cannot be opened.
sub read_fasta {
    my ($path) = @_;

    open(my $in, '<', $path) or die "Cannot open $path: $!\n";

    # Split on "\n>" rather than ">" so that a '>' inside a header
    # description does not start a bogus record; localized so the global
    # record separator is restored when this sub returns.
    local $/ = "\n>";

    my @records;
    my $total = 0;
    while (my $record = <$in>) {
        chomp $record;               # strips the trailing "\n>"
        $record =~ s/\A>//;          # the first record still carries its '>'
        next unless $record =~ /\S/; # skip empty/blank leading chunks

        my ($header, @lines) = split /\n/, $record;
        my ($id) = split /\s+/, $header;
        $id = '' unless defined $id;

        my $seq = join "", @lines;
        $seq =~ tr/\r//d;            # tolerate CRLF line endings

        push @records, [ $id, $seq ];
        $total += length $seq;
    }
    close($in);

    return (\@records, $total);
}

# Write records into consecutive files "<prefix>.<index>.fa".
#   $records   - array ref of [ $id, $sequence ]
#   $splitsize - cumulative sequence length at which a file is considered full
#   $prefix    - output path prefix (directory + base name)
# Every record is written exactly once: the record that pushes a file to or
# past $splitsize is written to that file, and the next record starts a new
# one. Existing files are overwritten rather than appended to.
# Returns the number of files written; dies on any open/close failure.
sub write_chunks {
    my ($records, $splitsize, $prefix) = @_;

    my $index     = 1;
    my $chunk_len = 0;
    my $files     = 0;
    my $out;

    for my $record (@$records) {
        my ($id, $seq) = @$record;

        # Open lazily so that no empty trailing file is created.
        if (!defined $out) {
            my $path = "$prefix.$index.fa";
            open($out, '>', $path) or die "Cannot write $path: $!\n";
            $files++;
        }

        print {$out} ">", $id, "\n", $seq, "\n";
        $chunk_len += length $seq;

        if ($chunk_len >= $splitsize) {
            close($out) or die "Cannot close $prefix.$index.fa: $!\n";
            undef $out;
            $chunk_len = 0;
            $index++;
        }
    }

    if (defined $out) {
        close($out) or die "Cannot close $prefix.$index.fa: $!\n";
    }

    return $files;
}

# Placeholder kept from the original script; it has no body and is not
# called anywhere in this file.
sub GetTime {
    return;
}

# Print the command-line help text and exit.
sub USAGE {
    my $usage = <<"USAGE";
Program: $Script
Version: $version
Usage:
  Options:
  -fa   <file>  Input fa file, forced
  -num  <int>   Number of output files to split into, forced
  -od   <dir>   Dir of output file, default ./
  -h            Help
USAGE
    print $usage;
    exit;
}
欢迎关注生信
