diff --git a/src/ga4gh/vrs/extras/annotator/vcf.py b/src/ga4gh/vrs/extras/annotator/vcf.py index 37f0d475..3dd70dcf 100644 --- a/src/ga4gh/vrs/extras/annotator/vcf.py +++ b/src/ga4gh/vrs/extras/annotator/vcf.py @@ -41,6 +41,18 @@ class FieldName(str, Enum): ERROR_FIELD = "VRS_Error" +# String-typed INFO fields where pysam incorrectly converts None → b"" +# (empty bytes) instead of the VCF missing value ".". Integer/Float fields +# are unaffected because pysam uses proper BCF missing sentinels for those. +_STRING_FIELDS = frozenset( + { + FieldName.IDS_FIELD, + FieldName.STATES_FIELD, + FieldName.ERROR_FIELD, + } +) + + # VCF character escape map VCF_ESCAPE_MAP = str.maketrans( { @@ -293,11 +305,20 @@ def annotate( if output_vcf_path and vcf_out: for k in additional_info_fields: - # Convert "" and None values (but not 0) to None. - # Pysam outputs "." for missing values. - record.info[k.value] = [ - None if v in ("", None) else v for v in vrs_field_data[k.value] - ] + # pysam correctly converts None → "." for Integer/Float + # INFO fields, but for String fields it converts None → + # "" (empty bytes), violating the VCF spec. Work around + # by using the literal string "." for String-typed fields. + if k in _STRING_FIELDS: + record.info[k.value] = [ + "." if v in ("", None) else v + for v in vrs_field_data[k.value] + ] + else: + record.info[k.value] = [ + None if v in ("", None) else v + for v in vrs_field_data[k.value] + ] vcf_out.write(record) vcf.close() diff --git a/src/ga4gh/vrs/extras/translator.py b/src/ga4gh/vrs/extras/translator.py index 1fe5e350..91c45532 100644 --- a/src/ga4gh/vrs/extras/translator.py +++ b/src/ga4gh/vrs/extras/translator.py @@ -142,8 +142,14 @@ def translate_to(self, vo: models._VariationBase, fmt: str, **kwargs) -> list[st kwargs: ref_seq_limit Optional(int): If vo.state is a ReferenceLengthExpression, and `ref_seq_limit` is specified, and `fmt` is `spdi`, the reference sequence is included in the SPDI expression if it is below the limit Otherwise only the length of the reference sequence is included. If the limit is None, the reference sequence is always included. In all cases, the alt sequence is included. Default is 0 (never include reference sequence). + :raise NotImplementedError: If `fmt` is not supported """ - t = self.to_translators[fmt] + try: + t = self.to_translators[fmt] + except KeyError as e: + msg = f"{fmt} is not supported" + raise NotImplementedError(msg) from e + return t(vo, **kwargs) ############################################################################ @@ -154,7 +160,7 @@ def hgvs_tools(self) -> HgvsTools: """Instantiate and return an HgvsTools instance""" return HgvsTools(self.data_proxy) - def _from_vrs(self, var: dict) -> models._VariationBase | None: + def _from_vrs(self, var: dict, **kwargs) -> models._VariationBase | None: # noqa: ARG002 """Convert from dict representation of VRS JSON to VRS object""" if not isinstance(var, Mapping): return None @@ -573,7 +579,7 @@ def _from_hgvs( ) copies = kwargs.get("copies") - if copies: + if copies is not None: cnv = models.CopyNumberCount(location=location, copies=copies) else: copy_change = kwargs.get("copy_change") diff --git a/tests/extras/data/test_vcf_expected_altsonly_output.vcf b/tests/extras/data/test_vcf_expected_altsonly_output.vcf index 8d74d7ca..3b7f2ab5 100644 --- a/tests/extras/data/test_vcf_expected_altsonly_output.vcf +++ b/tests/extras/data/test_vcf_expected_altsonly_output.vcf @@ -240,7 +240,7 @@ chr19 82664 . C T 50 PASS platforms=2;platformnames=10X,PacBio;datasets=2;datase chr19 284350 . CA C 50 PASS platforms=4;platformnames=Illumina,10X,PacBio,CG;datasets=4;datasetnames=HiSeqPE300x,10XChromiumLR,CCS15kb_20kb,CGnormal;callsets=5;callsetnames=HiSeqPE300xGATK,10XLRGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes;datasetsmissingcall=CCS15kb_20kb,IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;difficultregion=GRCh38_AllHomopolymers_gt6bp_imperfectgt10bp_slop5,GRCh38_SimpleRepeat_imperfecthomopolgt10_slop5;VRS_Allele_IDs=ga4gh:VA.a04jFsNg0bS0RMIWjKWSbwJS4_vp7S6x;VRS_Starts=284350;VRS_Ends=284366;VRS_States=AAAAAAAAAAAAAAA;VRS_Lengths=15;VRS_RepeatSubunitLengths=1 GT:PS:DP:ADALL:AD:GQ 0/1:.:422:117,101:81,75:356 chr19 289464 . T TCACGCCTGTAATCC 50 PASS platforms=4;platformnames=Illumina,PacBio,CG,10X;datasets=4;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR;callsets=6;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,CCS15kb_20kbDV,10XLRGATK;datasetsmissingcall=IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.ySvDptXfHB_9WEfu78v32DzBXJfwGgO7;VRS_Starts=289464;VRS_Ends=289466;VRS_States=CACGCCTGTAATCCCA;VRS_Lengths=.;VRS_RepeatSubunitLengths=. GT:PS:DP:ADALL:AD:GQ 0/1:.:518:94,98:116,137:785 chr19 28946400 . T C 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.uV5O4M9zpiwk6sftOd-EDvtw_pkSAvdf;VRS_Starts=28946399;VRS_Ends=28946400;VRS_States=C;VRS_Lengths=.;VRS_RepeatSubunitLengths=. GT:PS:DP:ADALL:AD:GQ 1/1:.:874:0,275:115,378:502 -chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt;VRS_Allele_IDs=ga4gh:VA.lok7a3lot_cvUyw626otpJi4yxk0X07v;VRS_Starts=490414;VRS_Ends=490416;VRS_States;VRS_Lengths=0;VRS_RepeatSubunitLengths=2 GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 -chr19 54220024 . G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1;VRS_Starts=.,54220023;VRS_Ends=.,54220024;VRS_States=,A;VRS_Lengths=.,.;VRS_RepeatSubunitLengths=.,. GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 +chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt;VRS_Allele_IDs=ga4gh:VA.lok7a3lot_cvUyw626otpJi4yxk0X07v;VRS_Starts=490414;VRS_Ends=490416;VRS_States=.;VRS_Lengths=0;VRS_RepeatSubunitLengths=2 GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 +chr19 54220024 . G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=.,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1;VRS_Starts=.,54220023;VRS_Ends=.,54220024;VRS_States=.,A;VRS_Lengths=.,.;VRS_RepeatSubunitLengths=.,. GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 chr19 54220999 . A T 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Error=Reference mismatch at GRCh38:chr19 position 54220998-54220999 (input gave 'A' but correct ref is 'T') GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 -chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu,;VRS_Starts=54221653,.;VRS_Ends=54221654,.;VRS_States=A,;VRS_Lengths=.,.;VRS_RepeatSubunitLengths=.,. GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 +chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu,.;VRS_Starts=54221653,.;VRS_Ends=54221654,.;VRS_States=A,.;VRS_Lengths=.,.;VRS_RepeatSubunitLengths=.,. GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 diff --git a/tests/extras/data/test_vcf_expected_output.vcf b/tests/extras/data/test_vcf_expected_output.vcf index fd781c5a..da94feb4 100644 --- a/tests/extras/data/test_vcf_expected_output.vcf +++ b/tests/extras/data/test_vcf_expected_output.vcf @@ -240,7 +240,7 @@ chr19 82664 . C T 50 PASS platforms=2;platformnames=10X,PacBio;datasets=2;datase chr19 284350 . CA C 50 PASS platforms=4;platformnames=Illumina,10X,PacBio,CG;datasets=4;datasetnames=HiSeqPE300x,10XChromiumLR,CCS15kb_20kb,CGnormal;callsets=5;callsetnames=HiSeqPE300xGATK,10XLRGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes;datasetsmissingcall=CCS15kb_20kb,IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;difficultregion=GRCh38_AllHomopolymers_gt6bp_imperfectgt10bp_slop5,GRCh38_SimpleRepeat_imperfecthomopolgt10_slop5;VRS_Allele_IDs=ga4gh:VA.xgtXGA3ZkV1WgMc6eD9l64fX27S_TScW,ga4gh:VA.a04jFsNg0bS0RMIWjKWSbwJS4_vp7S6x;VRS_Starts=284349,284350;VRS_Ends=284351,284366;VRS_States=CA,AAAAAAAAAAAAAAA;VRS_Lengths=2,15;VRS_RepeatSubunitLengths=2,1 GT:PS:DP:ADALL:AD:GQ 0/1:.:422:117,101:81,75:356 chr19 289464 . T TCACGCCTGTAATCC 50 PASS platforms=4;platformnames=Illumina,PacBio,CG,10X;datasets=4;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR;callsets=6;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,CCS15kb_20kbDV,10XLRGATK;datasetsmissingcall=IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.nqqTUy-a2gssemOmJb4CJv-HNuFAmGrO,ga4gh:VA.ySvDptXfHB_9WEfu78v32DzBXJfwGgO7;VRS_Starts=289463,289464;VRS_Ends=289464,289466;VRS_States=T,CACGCCTGTAATCCCA;VRS_Lengths=1,.;VRS_RepeatSubunitLengths=1,. GT:PS:DP:ADALL:AD:GQ 0/1:.:518:94,98:116,137:785 chr19 28946400 . T C 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.yPr2pVvJeWHDHarhzAvOCb5Cn9UMF6a5,ga4gh:VA.uV5O4M9zpiwk6sftOd-EDvtw_pkSAvdf;VRS_Starts=28946399,28946399;VRS_Ends=28946400,28946400;VRS_States=T,C;VRS_Lengths=1,.;VRS_RepeatSubunitLengths=1,. GT:PS:DP:ADALL:AD:GQ 1/1:.:874:0,275:115,378:502 -chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt;VRS_Allele_IDs=ga4gh:VA.aje4-hx7eihWndAwfhzNq_7CZV3bRMXf,ga4gh:VA.lok7a3lot_cvUyw626otpJi4yxk0X07v;VRS_Starts=490413,490414;VRS_Ends=490416,490416;VRS_States=ACT,;VRS_Lengths=3,0;VRS_RepeatSubunitLengths=3,2 GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 -chr19 54220024 . G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.LlmfhAC3gQlVQUwXWYiYjrn5V_K8vBz1,,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1;VRS_Starts=54220023,.,54220023;VRS_Ends=54220024,.,54220024;VRS_States=G,,A;VRS_Lengths=1,.,.;VRS_RepeatSubunitLengths=1,.,. GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 +chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt;VRS_Allele_IDs=ga4gh:VA.aje4-hx7eihWndAwfhzNq_7CZV3bRMXf,ga4gh:VA.lok7a3lot_cvUyw626otpJi4yxk0X07v;VRS_Starts=490413,490414;VRS_Ends=490416,490416;VRS_States=ACT,.;VRS_Lengths=3,0;VRS_RepeatSubunitLengths=3,2 GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 +chr19 54220024 . G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.LlmfhAC3gQlVQUwXWYiYjrn5V_K8vBz1,.,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1;VRS_Starts=54220023,.,54220023;VRS_Ends=54220024,.,54220024;VRS_States=G,.,A;VRS_Lengths=1,.,.;VRS_RepeatSubunitLengths=1,.,. GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 chr19 54220999 . A T 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Error=Reference mismatch at GRCh38:chr19 position 54220998-54220999 (input gave 'A' but correct ref is 'T') GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 -chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.kea5G-J1teg0iHMbgUELy-4L9lbJkgoj,ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu,;VRS_Starts=54221653,54221653,.;VRS_Ends=54221654,54221654,.;VRS_States=T,A,;VRS_Lengths=1,.,.;VRS_RepeatSubunitLengths=1,.,. GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 +chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.kea5G-J1teg0iHMbgUELy-4L9lbJkgoj,ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu,.;VRS_Starts=54221653,54221653,.;VRS_Ends=54221654,54221654,.;VRS_States=T,A,.;VRS_Lengths=1,.,.;VRS_RepeatSubunitLengths=1,.,. GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 diff --git a/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf b/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf index c81bb519..9a50b61f 100644 --- a/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf +++ b/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf @@ -236,6 +236,6 @@ chr19 284350 . CA C 50 PASS platforms=4;platformnames=Illumina,10X,PacBio,CG;dat chr19 289464 . T TCACGCCTGTAATCC 50 PASS platforms=4;platformnames=Illumina,PacBio,CG,10X;datasets=4;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR;callsets=6;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,CCS15kb_20kbDV,10XLRGATK;datasetsmissingcall=IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.nqqTUy-a2gssemOmJb4CJv-HNuFAmGrO,ga4gh:VA.ySvDptXfHB_9WEfu78v32DzBXJfwGgO7 GT:PS:DP:ADALL:AD:GQ 0/1:.:518:94,98:116,137:785 chr19 28946400 . T C 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.yPr2pVvJeWHDHarhzAvOCb5Cn9UMF6a5,ga4gh:VA.uV5O4M9zpiwk6sftOd-EDvtw_pkSAvdf GT:PS:DP:ADALL:AD:GQ 1/1:.:874:0,275:115,378:502 chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt;VRS_Allele_IDs=ga4gh:VA.aje4-hx7eihWndAwfhzNq_7CZV3bRMXf,ga4gh:VA.lok7a3lot_cvUyw626otpJi4yxk0X07v GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 -chr19 54220024 . G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.LlmfhAC3gQlVQUwXWYiYjrn5V_K8vBz1,,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1 GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 +chr19 54220024 . G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.LlmfhAC3gQlVQUwXWYiYjrn5V_K8vBz1,.,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1 GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 chr19 54220999 . A T 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Error=Reference mismatch at GRCh38:chr19 position 54220998-54220999 (input gave 'A' but correct ref is 'T') GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 -chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.kea5G-J1teg0iHMbgUELy-4L9lbJkgoj,ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu, GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 +chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.kea5G-J1teg0iHMbgUELy-4L9lbJkgoj,ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu,. GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 diff --git a/tests/extras/test_allele_translator.py b/tests/extras/test_allele_translator.py index 24c461e8..15f87a89 100644 --- a/tests/extras/test_allele_translator.py +++ b/tests/extras/test_allele_translator.py @@ -331,6 +331,11 @@ def test_from_invalid(tlr): ): tlr.translate_from("BRAF amplication") + with pytest.raises( + ValueError, match="Unable to parse data as beacon, gnomad, hgvs, spdi, vrs" + ): + tlr.translate_from("BRAF amplication", assembly_name="GRCh37") + @pytest.mark.vcr def test_from_beacon(tlr): @@ -973,20 +978,7 @@ def test_normalize_microsatellite_counts(tlr, case): ) -# TODO: Readd these tests -# @pytest.mark.vcr -# def test_errors(tlr): -# with pytest.raises(ValueError): -# tlr._from_beacon("bogus") -# -# with pytest.raises(ValueError): -# tlr._from_gnomad("NM_182763.2:c.688+403C>T") -# -# with pytest.raises(ValueError): -# tlr._from_hgvs("NM_182763.2:c.688+403C>T") -# -# with pytest.raises(ValueError): -# tlr._from_hgvs("NM_182763.2:c.688_690inv") -# -# with pytest.raises(ValueError): -# tlr._from_spdi("NM_182763.2:c.688+403C>T") +@pytest.mark.vcr +def test_translate_to_invalid_fmt(tlr): + with pytest.raises(NotImplementedError, match="gnomad is not supported"): + tlr.translate_to(models.Allele.model_validate(snv_output), fmt="gnomad") diff --git a/tests/extras/test_cnv_translator.py b/tests/extras/test_cnv_translator.py index 723ddfd2..a7a3fa0e 100644 --- a/tests/extras/test_cnv_translator.py +++ b/tests/extras/test_cnv_translator.py @@ -161,3 +161,15 @@ def test_from_hgvs_cn(tlr, hgvsexpr, copies, expected): """Test that _from_hgvs works correctly for copy number count""" cn = tlr._from_hgvs(hgvsexpr, copies=copies) assert cn.model_dump(exclude_none=True) == expected + + +@pytest.mark.vcr +def test_from_hgvs_cn_copies_zero(tlr): + """Test that copies=0 produces CopyNumberCount, not CopyNumberChange. + + copies=0 is a valid input (homozygous deletion), but 0 is falsy in Python + so it was previously treated as missing and fell through to CopyNumberChange. + """ + cn = tlr._from_hgvs("NC_000013.11:g.26440969_26443305del", copies=0) + assert cn.type == "CopyNumberCount" + assert cn.copies == 0