# Recorded command sequence for post-processing the "top5perchrom" GTEx v8 junction table:
# add a best-sample summary, join anchor junction columns, convert to BED, attach
# GENCODE v46 gene records with bedtools, trim columns, and run consistency checks.
#
# Requires bash (uses $'\t'), perl, bedtools, and GNU sort/cut/tail.
#
# NOTE(review): several inputs below (the "fullannotationallowed", "fixed" and "all.*"
# files) are not produced by any command in this sequence; they presumably come from
# runs of the same steps on other inputs -- TODO confirm before treating this as one script.

# Step 1: prepend three columns describing the entry of column 18 ($f[17]) with the
# highest ratio. Column 18 is a comma-separated list of sample:coverage:ratio triples
# whose first (pre-comma) element is discarded. The first input line is the header.
cat top5perchrom.gtexv8.sorted.tsv | perl -ne '
    chomp;
    if(!defined($f)) {
        $f=$_;
        print "highest_alt_sample\thighest_alt_sample_coverage\thighest_alt_sample_coverage_ratio\t$f\n";
        next;
    }
    $f=$_;
    @f=split(/\t/,$f,-1);
    @f0=split(/,/,$f[17],-1);
    shift(@f0);
    $maxr=-1.0; $maxc=-1; $maxs=-1;
    for $e (@f0) {
        ($s,$c,$r)=split(/:/,$e,-1);
        if($r > $maxr) { $maxr=$r; $maxc=$c; $maxs=$s; }
    }
    print "$maxs\t$maxc\t$maxr\t$f\n";
' > top5perchrom.gtexv8.sorted.wmax.tsv

# Step 2: prepend anchor-junction columns, looked up by the junction id in column 4
# ($f[3]) of the step-1 output. Rows without a matching id are reported on stderr (file e0)
# and dropped.
#
# NOTE(review): the recorded command was truncated between `open(IN,"` and
# `) { chomp($line);` -- the anchor file name and the read-loop header were lost.
# The `while($line=<IN>)` header is reconstructed from the surviving loop body and
# close(IN). The file name could not be recovered, so it is read from the ANCHOR_TSV
# environment variable: set it to the tab-separated anchor table keyed by junction id in
# column 1 (presumably a Snaptron-style junction table, judging by the header names
# printed below) -- TODO confirm.
cat top5perchrom.gtexv8.sorted.wmax.tsv | perl -ne '
    BEGIN {
        open(IN,"<",$ENV{ANCHOR_TSV}) or die "cannot open ANCHOR_TSV: $!\n";
        while($line=<IN>) {
            chomp($line);
            @f=split(/\t/,$line,-1);
            $jid=$f[0];
            pop(@f);
            my @a=();
            for $e ($f[1],$f[2],$f[3],$f[5],$f[6],$f[7],$f[8]) { push(@a,$e); }
            @f1=splice(@f,11);
            for $e (@f1) { push(@a,$e); }
            $h{$jid}=\@a;
        }
        close(IN);
        print "chromosome\tstart\tend\tstrand\tannotated\tleft_motif\tright_motif\tsamples\tsamples_count\tcoverage_sum\tcoverage_avg\tcoverage_median\t";
    }
    chomp;
    if(!defined($f)) { $f=$_; print "$f\n"; next; }
    $f=$_;
    @f=split(/\t/,$f,-1);
    $jid=$f[3];
    $a=$h{$jid};
    if(!defined($a)) { print STDERR "missing for $f\n"; next; }
    $a0=join("\t",@$a);
    print "$a0\t$f\n";
' > top5perchrom.gtexv8.sorted.wmax.wanchor.tsv 2>e0

# Step 3: build a 6-column BED file from the joined table (header skipped).
# The start is decremented by one for the BED start column; the name column carries the
# whole original row (with the undecremented start) joined by "|" so that later steps can
# expand it back into tab-separated columns. Rows are de-duplicated, then sorted by
# chromosome, start, end as required by the bedtools -sorted option used below.
tail -n+2 top5perchrom.gtexv8.sorted.wmax.wanchor.tsv \
    | perl -ne 'chomp; $f=$_; @f=split(/\t/,$f,-1); ($c,$s,$e,$o)=@f; $s--; $i=join("|",@f); print "$c\t$s\t$e\t$i\t0\t$o\n";' \
    | LC_ALL=C sort -u \
    | LC_ALL=C sort -t$'\t' -k1,1 -k2,2n -k3,3n \
    > top5perchrom.gtexv8.sorted.wmax.wanchor.bed

# Step 4a: same-strand overlaps with the GENCODE v46 BED; -wao also reports junctions
# that overlap nothing.
bedtools intersect -sorted -a top5perchrom.gtexv8.sorted.wmax.wanchor.bed -b gencode.v46.annotation.bed -wao -s > top5perchrom.gtexv8.sorted.wmax.wanchor.overlapping_gv46

# Step 4b: expand the "|"-joined name columns back into tabs. Column 7 ($f[6]) equal to
# "." marks a junction without an overlapping annotation record; it gets three empty
# leading columns instead of the annotation name fields taken from column 10 ($f[9]).
# NOTE(review): `sort -k2,2` uses the default blank-delimited field splitting rather than
# tabs -- confirm this ordering is what is wanted.
cat top5perchrom.gtexv8.sorted.wmax.wanchor.overlapping_gv46 | perl -ne '
    chomp;
    $f=$_;
    @f=split(/\t/,$f,-1);
    $f0=$f[3];
    $f0=~s/\|/\t/g;
    if($f[6] eq ".") { print "\t\t\t$f0\n"; next; }
    $f1=$f[9];
    $f1=~s/^\|//;
    $f1=~s/\|$//;
    $f1=~s/\|/\t/g;
    print "$f1\t$f0\n";
' | LC_ALL=C sort -u | LC_ALL=C sort -k2,2 > top5perchrom.gtexv8.sorted.wmax.wanchor.wgv46genes.tsv

# Step 5a: closest same-strand GENCODE v46 record per junction, first one on ties, with
# the distance reported as the last output column (-D ref).
# The recorded command had "-t" and its value "first" on separate lines; rejoined here.
bedtools closest -D ref -s -t first -sorted -a top5perchrom.gtexv8.sorted.wmax.wanchor.bed -b gencode.v46.annotation.bed > top5perchrom.gtexv8.sorted.wmax.wanchor.closest_first_overlapping_gv46

# Step 5b: same expansion as step 4b, with the distance inserted between the annotation
# fields and the junction fields.
# NOTE(review): this writes to the same file as step 4b and therefore overwrites its
# output -- presumably the closest-based table superseded the intersect-based one; confirm.
cat top5perchrom.gtexv8.sorted.wmax.wanchor.closest_first_overlapping_gv46 | perl -ne '
    chomp;
    $f=$_;
    @f=split(/\t/,$f,-1);
    $f0=$f[3];
    $f0=~s/\|/\t/g;
    $f1=$f[9];
    $f1=~s/^\|//;
    $f1=~s/\|$//;
    $f1=~s/\|/\t/g;
    $d=pop(@f);
    print "$f1\t$d\t$f0\n";
' | LC_ALL=C sort -u | LC_ALL=C sort -k2,2 > top5perchrom.gtexv8.sorted.wmax.wanchor.wgv46genes.tsv

# Step 6: column trimming. The first command drops columns 12 and 37; the second keeps
# only the columns consumed by the checks in step 7.
cut -f 1-11,13-36,38- top5perchrom.gtexv8.sorted.fullannotationallowed.wmax.wanchor.wgv46genes.tsv > top5perchrom.gtexv8.sorted.fullannotationallowed.wmax.wanchor.wgv46genes.cut.tsv
cut -f 2,5-8,10-11,21,27-29,31,33-34 top5perchrom.gtexv8.sorted.fixed.wmax.wanchor.wgv46genes.tsv > top5perchrom.gtexv8.sorted.fixed.wmax.wanchor.wgv46genes.tsv.cut

# Step 7: write every row that fails a check, prefixed with the failure label.
#   BAD_DIST   - columns 3/10 or columns 4/11 differ by more than 6.
#   BAD_STRAND - both motifs (columns 6-7 and 13-14) are in the known motif sets but the
#                strands (columns 5 and 12) differ.
#   BAD_COORD  - both motifs are known and the selected coordinates are equal; the second
#                coordinate of a pair is selected when its motif is in %rh, else the first.
# %fh and %rh are the two motif sets (presumably forward- and reverse-strand spellings).
cat all.gtexv8.sorted.fixed.wmax.wanchor.wgv46genes.tsv.cut | perl -ne '
    BEGIN {
        %fh=("GT-AG"=>1,"AT-AC"=>1,"AT-AG"=>1);
        %rh=("CT-AC"=>1,"GT-AT"=>1,"CT-AT"=>1);
    }
    chomp;
    $f=$_;
    @f=split(/\t/,$f,-1);
    $d0=abs($f[2]-$f[9]);
    $d1=abs($f[3]-$f[10]);
    if($d0 > 6 || $d1 > 6) { print "BAD_DIST\t$f\n"; next; }
    $m0=$f[5]."-".$f[6];
    $m1=$f[12]."-".$f[13];
    $both_canonical=((defined($fh{$m0}) || defined($rh{$m0})) && (defined($fh{$m1}) || defined($rh{$m1})));
    $strand_equal=$f[4] eq $f[11];
    if($both_canonical == 1 && $strand_equal != 1) { print "BAD_STRAND\t$f\n"; next; }
    if($both_canonical == 1) {
        $anchor=$f[2]; $alt=$f[9];
        $anchor=$f[3] if(defined($rh{$m0}));
        $alt=$f[10] if(defined($rh{$m1}));
        if($anchor == $alt) { print "BAD_COORD\tCONSISTENCY\t$f\n"; next; }
    }
' > all.gtexv8.sorted.fixed.wmax.wanchor.wgv46genes.tsv.cut.failed_checks

# Step 8: run finalize_jxns.sh on the "allannotatedallowed" table in the background,
# with bash tracing (-x) and resource statistics (time -v); all output goes to the .run0 log.
/usr/bin/time -v /bin/bash -x finalize_jxns.sh /srv/nvme2/deploy/gtexv2expanded/data/ggsplicing/filtered100/postprocessed.removalFilters3and4/top5perchrom.gtexv8.sorted.allannotatedallowed.tsv > top5perchrom.gtexv8.sorted.allannotatedallowed.tsv.run0 2>&1 &