# Copyright (C) 2011 Bo Peng (bpeng@mdanderson.org)
# Distributed under GPL. see http://www.gnu.org/licenses/
#
# Please refer to http://varianttools.sourceforge.net/Format/New for
# a description of the format of this file.

# Overall description of the format: which fields make up a variant,
# its genotype, and the variant/genotype info fields. %(name)s refers
# to the parameter 'name' defined in [DEFAULT] below.
[format description]
description=Import vcf
variant=chr,%(pos)s,%(ref)s,%(alt)s
genotype=%(geno)s
variant_info=%(var_info)s
genotype_info=%(geno_info)s
# variants with identical chr,pos,ref will be collapsed.
export_by=chr,%(pos)s,%(ref)s

# Parameters of this format and their default values. Each parameter NAME
# is paired with a NAME_comment help text, and is referenced elsewhere in
# this file as %(NAME)s.
[DEFAULT]
format_string=
format_string_comment=FORMAT string that is output in the 9th column of exported vcf file. Please specify an appropriate value corresponding to the parameter --geno_info because this column cannot be automatically determined.

geno_info=
geno_info_comment=Genotype information fields. No genotype field is imported by default. You may edit it into, for example, "geno_info=GD,GQ,PL", if the .vcf format field looks like "GT:GD:GQ:PL". Please check the FORMAT string of your .vcf file to determine available fields to be imported.

geno=GT
geno_comment=Field to extract genotype from .vcf file. You can set it to safe_GT if genotype is not the first field in the genotype columns of your .vcf file.

var_info=
var_info_comment=Variant information fields to be imported. Please check the INFO column of your vcf file for available fields.

phase_sep='/'
phase_sep_comment=Separator used to output genotype, / for unphased and | for phased. This parameter is needed because 'vtools import' does not save phase information for each genotype.

wildtype_code=('0',)
wildtype_code_comment=How wildtype homozygotes are imported. These genotypes are by default imported as GT=0. They will be discarded if you set this parameter to None.

id=
id_comment=The field to output to the third (ID) column of the vcf file. You can use an id field (if you imported or updated it from a vcf file), another ID field (e.g. rsname) if available, or '' to output all missing values.

# NOTE: the second line of pos_comment is a continuation of the value and
# must stay indented.
pos=pos
pos_comment=Field for position.
    To export indel, set it to 'pos-length(upstream)'

ref=ref
ref_comment=Field for reference allele

alt=alt
alt_comment=Field for alternative allele

qual=
qual_comment=Field for quality score

filter=
filter_comment=Field for filter status

info=
info_comment=Field for info status

# Output formatters, keyed as fmt_<field>.
[field formatter]
#
# NOTE: if multiple records are collapsed, they are passed as tuples.
# If no formatter is defined, the first value will be used in output.
#
fmt_GT=GenoFormatter(style='vcf')
fmt_DP=IfMulti(Formatter('DP={[0]}'), Formatter('DP={}'))
# if there are multiple alternative alleles, join them by ','
#
# not sure if we need to keep both JoinRecords and JoinFields
fmt_alt=JoinRecords(',')
fmt_raw_alt=JoinRecords(',')

# Columns of the exported file: [col_N] describes the N-th output column.
[col_1]
field=chr
# remove fields with empty chromosome
adj=ValueOfNull(None)
comment=chromosome

[col_2]
# Use the pos parameter (like col_4/col_5 do for ref/alt) so that a
# user-specified --pos, e.g. 'pos-length(upstream)', takes effect on export.
field=%(pos)s
comment=position

[col_3]
field=%(id)s
adj=ValueOfNull('.')
comment=id

[col_4]
field=%(ref)s
comment=reference allele

[col_5]
field=%(alt)s
comment=alternative alleles

[col_6]
field=%(qual)s
adj=ValueOfNull('0')
comment=quality

[col_7]
field=%(filter)s
adj=ValueOfNull('PASS')
comment=filter

[col_8]
field=%(info)s
adj=ValueOfNull('.'), JoinFields(';')
comment=variant info

[col_9]
field=
adj=Constant("%(format_string)s")
comment=genotype format

[col_10]
field=%(geno)s,%(geno_info)s
adj=JoinFields(':')
comment=genotype

# Fields that can be imported. 'index' is the column of the input file the
# field is read from (e.g. 2 for the position column), 'type' is the type
# of the stored field, and 'adj' names the adjustment applied to the raw
# value (presumably before it is stored -- confirm against the importer).
[chr]
index=1
type=VARCHAR(20)
adj=RemoveLeading('chr')
comment=Chromosome

[pos]
index=2
type=INTEGER NOT NULL
comment=1-based position

[raw_pos]
index=2
type=INTEGER
comment=1-based position

[id]
index=3
type=VARCHAR(48)
adj=Nullify('.')
comment=variant id (rs number or other IDs)

[ref]
index=4
type=VARCHAR(255)
comment=Reference allele, '-' for insertion.

[alt]
index=5
adj=CheckSplit()
type=VARCHAR(255)
comment=Alternative allele, '-' for deletion.

[raw_ref]
index=4
type=VARCHAR(255)
comment=Reference allele, without removing common leading and ending nucleotide.
[raw_alt]
index=5
adj=CheckSplit()
type=VARCHAR(255)
comment=Alternative allele, without removing common leading and ending nucleotide.

# upstream/downstream are derived from both the ref (4) and alt (5) columns.
[upstream]
index=4,5
adj=CommonLeading()
type=VARCHAR(255)
comment=Common leading alleles of ref and alt alleles stored in .vcf file. This field is only available for indels.

[downstream]
index=4,5
adj=CommonEnding()
type=VARCHAR(255)
comment=Common ending alleles of ref and alt alleles stored in .vcf file, common leading is extracted before common ending. This field is only available for indels.

[qual]
index=6
type=FLOAT
comment=phred-scaled quality score for the assertion made in ALT. High QUAL scores indicate high confidence calls.

[filter]
index=7
type=VARCHAR(255)
comment=PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail.

[GT]
# NOTE(review): the trailing ':' in the index presumably selects column 10
# and every following (per-sample) column -- confirm against the importer.
index=10:
type=INTEGER
adj=VcfGenotype(default=%(wildtype_code)s)
comment=Genotype coded as 0 (ref ref), 1 (ref alt), 2 (alt alt) or -1 (alt1, alt2), assuming GT is the first FORMAT field in the .vcf file. Missing genotype will be dropped.

[safe_GT]
# This vcf genotype extractor uses format string and genotype to
# extract genotype. Although the format string should not be needed
# because the genotype field should be the first one, one of my datasets
# does not follow this rule. The performance penalty is significant
# 4.36 -> 3.03 for 50k records.
index=9, 10:
adj=VcfGenoFromFormat(default=%(wildtype_code)s)
type=INTEGER
comment=Genotype coded as 0 (ref ref), 1 (ref alt), 2 (alt alt) or -1 (alt1, alt2). This field checks the FORMAT string and extracts genotype accordingly. Missing genotype will be dropped.

[info]
index=8
type=VARCHAR(255)
comment=Raw INFO column in the vcf file. This may be further split into various specified info fields, such as DP, etc.

[DP]
# Site-level depth: the value following 'DP=' in the INFO column (8).
# NOTE: the second line of the comment value is a continuation and must
# stay indented.
index=8
adj=ExtractValue('DP=', ';')
type=INTEGER
comment=DP fields in INFO.
    Total depth of samples in a vcf file.

[DP_geno]
# Per-sample depth: read from the FORMAT string (column 9) together with
# the genotype columns (10 onward), not from INFO.
# Passing 'GT:DP\t0/0:64'
index=9, 10:
adj=FieldFromFormat('DP', ':')
type=INTEGER
comment=DP field in FORMAT. Depth for each sample in a vcf file.