-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCreate_Cosmic.sh
More file actions
163 lines (117 loc) · 4.62 KB
/
Create_Cosmic.sh
File metadata and controls
163 lines (117 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/bin/sh
#Description: Create COSMIC file compatible as input to, for example, MuTect. Good for getting updated cosmic version.
#Requires: CosmicCodingMuts.vcf.gz and CosmicNonCodingVariants.vcf.gz from http://cancer.sanger.ac.uk/cosmic and the reference genome useds .fai file
#Commandline: sh Create_Cosmic.sh <reference genome>.fai
#Author: Sebastian DiLorenzo
#Date: 2015-11-06
#Create the sortByRef.pl file
cat > sortByRef.pl << EOF
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
sub usage {
print "\nUsage:\n";
print "sortByRef.pl [--k POS] [--tmp dir] INPUT REF_DICT\n\n";
print " Sorts lines of the input file INFILE according\n";
print " to the reference contig order specified by the\n";
print " reference dictionary REF_DICT (.fai file).\n";
print " The sort is stable. If -k option is not specified,\n";
print " it is assumed that the contig name is the first\n";
print " field in each line.\n\n";
print " INPUT input file to sort. If '-' is specified, \n";
print " then reads from STDIN.\n";
print " REF_DICT .fai file, or ANY file that has contigs, in the\n";
print " desired soting order, as its first column.\n";
print " --k POS : contig name is in the field POS (1-based)\n";
print " of input lines.\n\n";
print " --tmp DIR : temp directory [default=/tmp]\n\n";
exit(1);
}
my \$pos = 1;
my \$tmp = "/tmp";
GetOptions( "k:i" => \\\$pos,
"tmp=s" => \\\$tmp);
\$pos--;
usage() if ( scalar(@ARGV) == 0 );
if ( scalar(@ARGV) != 2 ) {
print "Wrong number of arguments\n";
usage();
}
my \$input_file = \$ARGV[0];
my \$dict_file = \$ARGV[1];
open(DICT, "< \$dict_file") or die("Can not open \$dict_file: \$!");
my %ref_order;
my \$n = 0;
while ( <DICT> ) {
chomp;
my (\$contig, \$rest) = split '\s';
die("Dictionary file is probably corrupt: multiple instances of contig \$contig") if ( defined \$ref_order{\$contig} );
\$ref_order{\$contig} = \$n;
\$n++;
}
close DICT;
#we have loaded contig ordering now
my \$INPUT;
if (\$input_file eq "-" ) {
\$INPUT = "STDIN";
} else {
open(\$INPUT, "< \$input_file") or die("Can not open \$input_file: \$!");
}
my %temp_outputs;
while ( <\$INPUT> ) {
my @fields = split '\s';
die("Specified field position exceeds the number of fields:\n\$_")
if ( \$pos >= scalar(@fields) );
my \$contig = \$fields[\$pos];
if ( \$contig =~ m/:/ ) {
my @loc = split(/:/, \$contig);
# print \$contig . " " . \$loc[0] . "\n";
\$contig = \$loc[0]
}
chomp \$contig if ( \$pos == scalar(@fields) - 1 ); # if last field in line
my \$order;
if ( defined \$ref_order{\$contig} ) { \$order = \$ref_order{\$contig}; }
else {
\$ref_order{\$contig} = \$n;
\$order = \$n; # input line has contig that was not in the dict;
\$n++; # this contig will go at the end of the output,
# after all known contigs
}
my \$fhandle;
if ( defined \$temp_outputs{\$order} ) { \$fhandle = \$temp_outputs{\$order} }
else {
#print "opening \$order \$\$ \$_\n";
open( \$fhandle, " > \$tmp/sortByRef.\$\$.\$order.tmp" ) or
die ( "Can not open temporary file \$order: \$!");
\$temp_outputs{\$order} = \$fhandle;
}
# we got the handle to the temp file that keeps all
# lines with contig \$contig
print \$fhandle \$_; # send current line to its corresponding temp file
}
close \$INPUT;
foreach my \$f ( values %temp_outputs ) { close \$f; }
# now collect back into single output stream:
for ( my \$i = 0 ; \$i < \$n ; \$i++ ) {
# if we did not have any lines on contig \$i, then there's
# no temp file and nothing to do
next if ( ! defined \$temp_outputs{\$i} ) ;
my \$f;
open ( \$f, "< \$tmp/sortByRef.\$\$.\$i.tmp" );
while ( <\$f> ) { print ; }
close \$f;
unlink "\$tmp/sortByRef.\$\$.\$i.tmp";
}
EOF
#Execute the Cosmic creation
gunzip CosmicCodingMuts.vcf.gz
gunzip CosmicNonCodingVariants.vcf.gz
grep "^#" CosmicCodingMuts.vcf > VCF_Header
grep -v "^#" CosmicCodingMuts.vcf > Coding.clean
grep -v "^#" CosmicNonCodingVariants.vcf > Noncoding.clean
#cat Coding.clean Noncoding.clean | sort -gk 2,2 | awk '{print "chr"$0}' | perl sortByRef.pl --k 1 - /gulo/glob/seba/Apps/GenomeAnalysisTK/2.8/hg19/ucsc.hg19.fasta.fai > Cosmic.hg19
cat Coding.clean Noncoding.clean | sort -gk 2,2 | awk '{print "chr"$0}' | perl sortByRef.pl --k 1 - $1 > Cosmic.txt
cosmic=`grep -o -P 'COSMICv.{0,2}' VCF_Header`
cat VCF_Header Cosmic.txt > $cosmic.vcf
#clean-up directory
rm *.clean VCF_Header sortByRef.pl Cosmic.txt