diff --git a/data/whatis_example.json b/data/whatis_example.json new file mode 100644 index 00000000..29b15b13 --- /dev/null +++ b/data/whatis_example.json @@ -0,0 +1 @@ +{"data":{"nodes":[{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":0,"ts_flags":1,"time":0,"child_of":[10],"parent_of":[],"x_pos_reference":-1,"label":"a","fx":150,"fy":250,"y":250,"index":0,"x":150,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":1,"ts_flags":1,"time":0,"child_of":[18],"parent_of":[],"x_pos_reference":-1,"label":"d","fx":283,"fy":250,"y":250,"index":1,"x":283,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":2,"ts_flags":1,"time":0,"child_of":[13],"parent_of":[],"x_pos_reference":-1,"label":"e","fx":328,"fy":250,"y":250,"index":2,"x":328,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":3,"ts_flags":1,"time":0,"child_of":[10],"parent_of":[],"x_pos_reference":-1,"label":"b","fx":194,"fy":250,"y":250,"index":3,"x":194,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":4,"ts_flags":1,"time":0,"child_of":[17],"parent_of":[],"x_pos_reference":-1,"label":"c","fx":239,"fy":250,"y":250,"index":4,"x":239,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":5,"ts_flags":1,"time":0,"child_of":[12],"parent_of":[],"x_pos_reference":-1,"label":"f","fx":372,"fy":250,"y":250,"index":5,"x":372,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":6,"ts_flags":1,"time":0,"child_of":[11],"parent_of":[],"x_pos_reference":-1,"label":"g","fx":417,"fy":250,"y":250,"index":6,"x":417,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":7,"ts_flags":1,"time":0,"child_of":[15],"parent_of":[],"x_pos_reference":-1,"label":"i","fx":506,"fy":250,"y":250,"index":7,"x":506,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":8,"ts_flags":1,"time":0,"child_of":[11],"parent_of":[],"x_pos_reference":-1,"label":"h","fx":461,"fy":250,"y":250,"index":8,"x":461,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":9,"ts_flags":1,"time":0,"child_of":[15],"parent_of":[],"x_pos_reference":-1,"label":"j","fx":550,"fy":250,"y":250,"index":9,"x":550,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":10,"ts_flags":0,"time":311.25976965689426,"child_of":[17],"parent_of":[0,3],"x_pos_reference":-1,"label":"","x":172,"fy":234.6153846153846,"y":234.6153846153846,"index":10,"vy":0,"vx":0,"fx":172},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":11,"ts_flags":0,"time":487.8028220685495,"child_of":[12],"parent_of":[6,8],"x_pos_reference":-1,"label":"","x":437.77289828142125,"fy":219.23076923076923,"y":219.23076923076923,"index":11,"vy":0,"vx":0,"fx":437.77289828142125},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":12,"ts_flags":0,"time":1223.6271803895934,"child_of":[16],"parent_of":[5,11],"x_pos_reference":-1,"label":"","x":397.41637985695985,"fy":203.84615384615384,"y":203.84615384615384,"index":12,"vy":0,"vx":4.6619708093942336e-11,"fx":397.41637985695985},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":13,"ts_flags":131072,"time":1246.4183161012481,"child_of":[16,19],"parent_of":[2],"x_pos_reference":2,"label":"","x":328,"fy":188.46153846153845,"y":188.46153846153845,"index":13,"vy":0,"vx":0,"fx":328},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":15,"ts_flags":0,"time":1271.2734779542789,"child_of":[22],"parent_of":[7,9],"x_pos_reference":-1,"label":"","x":529.322755682009,"fy":173.0769230769231,"y":173.0769230769231,"index":14,"vy":0,"vx":8.158757003329243e-12,"fx":529.322755682009},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":16,"ts_flags":0,"time":1659.725091972838,"child_of":[20],"parent_of":[12,13],"x_pos_reference":-1,"label":"","x":373.31328138997435,"fy":157.69230769230768,"y":157.69230769230768,"index":15,"vy":0,"vx":-2.0449017479329257e-10,"fx":373.31328138997435},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":17,"ts_flags":0,"time":1961.7599532212648,"child_of":[18],"parent_of":[4,10],"x_pos_reference":-1,"label":"","x":194.38099308073106,"fy":142.3076923076923,"y":142.3076923076923,"index":16,"vy":0,"vx":6.79103440148765e-12,"fx":194.38099308073106},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":18,"ts_flags":0,"time":4135.539932100284,"child_of":[19],"parent_of":[1,17],"x_pos_reference":-1,"label":"","x":244.8923505399351,"fy":126.92307692307692,"y":126.92307692307692,"index":17,"vy":0,"vx":-8.945102991653186e-12,"fx":244.8923505399351},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":19,"ts_flags":0,"time":6272.301763476431,"child_of":[23],"parent_of":[13,18],"x_pos_reference":-1,"label":"","x":287.7165695894524,"fy":111.53846153846155,"y":111.53846153846155,"index":18,"vy":0,"vx":0,"fx":287.7165695894524},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":20,"ts_flags":131072,"time":7008.788482970518,"child_of":[22,23],"parent_of":[16],"x_pos_reference":16,"label":"","x":373.3132813901788,"fy":96.15384615384615,"y":96.15384615384615,"index":19,"vy":0,"vx":0,"fx":373.3132813901788},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":22,"ts_flags":0,"time":8592.848809367675,"child_of":[24],"parent_of":[15,20],"x_pos_reference":-1,"label":"","x":458.049985157943,"fy":80.76923076923075,"y":80.76923076923075,"index":20,"vy":0,"vx":8.623578517941155e-11,"fx":458.049985157943},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":23,"ts_flags":0,"time":10524.123281839178,"child_of":[24],"parent_of":[19,20],"x_pos_reference":-1,"label":"","x":326.7442210897065,"fy":65.38461538461537,"y":65.38461538461537,"index":21,"vy":0,"vx":0,"fx":326.7442210897065},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":24,"ts_flags":0,"time":15165.157197158467,"child_of":[],"parent_of":[22,23],"x_pos_reference":-1,"label":"","x":397.8856001292887,"fy":50,"y":50,"index":22,"vy":0,"vx":8.095707437760779e-12,"fx":397.8856001292887}],"links":[{"id":0,"source":10,"source_time":311.25976965689426,"target":0,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":3,"region_fraction":1,"stroke":"#053e4e"},{"id":1,"source":10,"source_time":311.25976965689426,"target":3,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":0,"region_fraction":1,"stroke":"#053e4e"},{"id":2,"source":11,"source_time":487.8028220685495,"target":6,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":8,"region_fraction":1,"stroke":"#053e4e"},{"id":3,"source":11,"source_time":487.8028220685495,"target":8,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":6,"region_fraction":1,"stroke":"#053e4e"},{"id":4,"source":12,"source_time":1223.6271803895934,"target":5,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":11,"region_fraction":1,"stroke":"#053e4e"},{"id":5,"source":12,"source_time":1223.6271803895934,"target":11,"target_time":487.8028220685495,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":5,"region_fraction":1,"stroke":"#053e4e"},{"id":6,"source":13,"source_time":1246.4183161012481,"target":2,"target_time":0,"bounds":"0.0-367.0 367.0-900.0","alt_parent":-1,"alt_child":-1,"region_fraction":1,"stroke":"#053e4e"},{"id":7,"source":15,"source_time":1271.2734779542789,"target":7,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":9,"region_fraction":1,"stroke":"#053e4e"},{"id":8,"source":15,"source_time":1271.2734779542789,"target":9,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":7,"region_fraction":1,"stroke":"#053e4e"},{"id":9,"source":16,"source_time":1659.725091972838,"target":12,"target_time":1223.6271803895934,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":13,"region_fraction":1,"stroke":"#053e4e"},{"id":10,"source":16,"source_time":1659.725091972838,"target":13,"target_time":1246.4183161012481,"bounds":"367.0-900.0","alt_parent":19,"alt_child":12,"region_fraction":0.5922222222222222,"stroke":"#053e4e"},{"id":11,"source":17,"source_time":1961.7599532212648,"target":4,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":10,"region_fraction":1,"stroke":"#053e4e"},{"id":12,"source":17,"source_time":1961.7599532212648,"target":10,"target_time":311.25976965689426,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":4,"region_fraction":1,"stroke":"#053e4e"},{"id":13,"source":18,"source_time":4135.539932100284,"target":1,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":17,"region_fraction":1,"stroke":"#053e4e"},{"id":14,"source":18,"source_time":4135.539932100284,"target":17,"target_time":1961.7599532212648,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":1,"region_fraction":1,"stroke":"#053e4e"},{"id":15,"source":19,"source_time":6272.301763476431,"target":13,"target_time":1246.4183161012481,"bounds":"0.0-367.0","alt_parent":16,"alt_child":18,"region_fraction":0.4077777777777778,"stroke":"#053e4e"},{"id":16,"source":19,"source_time":6272.301763476431,"target":18,"target_time":4135.539932100284,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":13,"region_fraction":1,"stroke":"#053e4e"},{"id":17,"source":20,"source_time":7008.788482970518,"target":16,"target_time":1659.725091972838,"bounds":"0.0-600.0 600.0-900.0","alt_parent":-1,"alt_child":-1,"region_fraction":1,"stroke":"#053e4e"},{"id":18,"source":22,"source_time":8592.848809367675,"target":15,"target_time":1271.2734779542789,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":20,"region_fraction":1,"stroke":"#053e4e"},{"id":19,"source":22,"source_time":8592.848809367675,"target":20,"target_time":7008.788482970518,"bounds":"600.0-900.0","alt_parent":23,"alt_child":15,"region_fraction":0.3333333333333333,"stroke":"#053e4e"},{"id":20,"source":23,"source_time":10524.123281839178,"target":19,"target_time":6272.301763476431,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":20,"region_fraction":1,"stroke":"#053e4e"},{"id":21,"source":23,"source_time":10524.123281839178,"target":20,"target_time":7008.788482970518,"bounds":"0.0-600.0","alt_parent":22,"alt_child":19,"region_fraction":0.6666666666666666,"stroke":"#053e4e"},{"id":22,"source":24,"source_time":15165.157197158467,"target":22,"target_time":8592.848809367675,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":23,"region_fraction":1,"stroke":"#053e4e"},{"id":23,"source":24,"source_time":15165.157197158467,"target":23,"target_time":10524.123281839178,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":22,"region_fraction":1,"stroke":"#053e4e"}],"mutations":[],"breakpoints":[{"id":0,"start":0,"stop":367,"x_pos_01":0,"width_01":0.4077777777777778,"fill":"#053e4e","x_pos":100,"width":203.88888888888889,"included":true},{"id":1,"start":367,"stop":600,"x_pos_01":0.4077777777777778,"width_01":0.2588888888888889,"fill":"#053e4e","x_pos":303.8888888888889,"width":129.44444444444446,"included":true},{"id":2,"start":600,"stop":900,"x_pos_01":0.6666666666666666,"width_01":0.3333333333333333,"fill":"#053e4e","x_pos":433.3333333333333,"width":166.66666666666666,"included":true}],"evenly_distributed_positions":[150,194,239,283,328,372,417,461,506,550]},"width":600,"height":375,"y_axis":{"title":"Time ago (generations)","units":"generations","include_labels":true,"ticks":[250,234.6153846153846,219.23076923076923,203.84615384615384,188.46153846153845,173.0769230769231,157.69230769230768,142.3076923076923,126.92307692307692,111.53846153846155,96.15384615384615,80.76923076923075,65.38461538461537,50],"text":[0,311,488,1224,1246,1271,1660,1962,4136,6272,7009,8593,10524,15165],"max_min":[250,50],"scale":"rank"},"edges":{"type":"ortho","variable_width":false,"include_underlink":true},"condense_mutations":false,"label_mutations":false,"tree_highlighting":true,"rotate_tip_labels":false,"plot_type":"full","default_node_style":{"size":150,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":4},"title":null,"preamble":null,"save_filename":"tskit_arg_visualizer"} \ No newline at end of file diff --git a/data/whatis_example.trees b/data/whatis_example.trees index ea5f519b..71fd6915 100644 Binary files a/data/whatis_example.trees and b/data/whatis_example.trees differ diff --git a/data/whatis_example.yml b/data/whatis_example.yml index f172a1fc..caef6b39 100644 --- a/data/whatis_example.yml +++ b/data/whatis_example.yml @@ -8,9 +8,9 @@ - name: Ancestral_population epochs: - end_time: 1000 - - name: A + - name: P ancestors: [Ancestral_population] - - name: B + - name: Q ancestors: [Ancestral_population] epochs: - start_size: 2000 @@ -18,7 +18,7 @@ - start_size: 400 end_size: 10000 migrations: - - source: A - dest: B + - source: P + dest: Q rate: 1e-4 \ No newline at end of file diff --git a/getting_started.md b/getting_started.md index 1a529e5d..35ef3baf 100644 --- a/getting_started.md +++ b/getting_started.md @@ -18,13 +18,13 @@ kernelspec: # Getting started with {program}`tskit` You've run some simulations or inference methods, and you now have a -{class}`TreeSequence` object; what now? This tutorial is aimed +{class}`TreeSequence` object; what now? This tutorial is aimed at users who are new to {program}`tskit` and would like to get some basic tasks completed. We'll look at five fundamental things you might -need to do: +want to do: {ref}`process trees`, -{ref}`sites & mutations`, and -{ref}`genotypes`, +{ref}`process sites & mutations`, +{ref}`extract genotypes`, {ref}`compute statistics`, and {ref}`save or export data`. Throughout, we'll also provide pointers to where you can learn more. @@ -32,7 +32,7 @@ Throughout, we'll also provide pointers to where you can learn more. :::{note} The examples in this tutorial are all written using the {ref}`tskit:sec_python_api`, but it's also possible to -{ref}`use R `, or access the API in other languages, notably +{ref}`use R`, or access the API in other languages, notably {ref}`C` and [Rust](https://github.com/tskit-dev/tskit-rust). ::: @@ -43,7 +43,6 @@ twenty diploid individuals. To make it a bit more interesting, we'll simulate th of a {ref}`selective sweep` in the middle of the chromosome, then throw some neutral mutations onto the resulting tree sequence. - ```{code-cell} ipython3 import msprime @@ -59,9 +58,9 @@ ts = msprime.sim_ancestry( population_size=pop_size, sequence_length=seq_length, recombination_rate=1e-8, - random_seed=1234, # only needed for repeatabilty + random_seed=1234, # only needed for repeatability ) -# Optionally add finite-site mutations to the ts using the Jukes & Cantor model, creating SNPs +# Optionally add finite-site mutations to the ts using the default Jukes & Cantor model, creating SNPs ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=4321) ts ``` @@ -119,14 +118,14 @@ forwards through trees in a tree sequence (or indeed backwards using the standar {func}`~py:reversed` function) is efficient. That means it's quick, for example to check if all the trees in a tree sequence have fully coalesced (which is to be expected in reverse-time, coalescent simulations, but not always for tree sequences produced by -{ref}`forward simulation `). +{ref}`forward simulation`). ```{code-cell} ipython3 import time elapsed = time.time() for tree in ts.trees(): if tree.has_multiple_roots: - print("Tree {tree.index} has not coalesced") + print(f"Tree {tree.index} has not coalesced") break else: elapsed = time.time() - elapsed @@ -159,7 +158,7 @@ plt.show() It's obvious that there's something unusual about the trees in the middle of this chromosome, where the selective sweep occurred. -Although tskit is designed so that is it rapid to pass through trees sequentially, +Although tskit is designed so that it is rapid to pass through trees sequentially, it is also possible to pull out individual trees from the middle of a tree sequence via the {meth}`TreeSequence.at` method. Here's how you can use that to extract the tree at location $5\ 000\ 000$ --- the position of the sweep --- and draw it @@ -173,7 +172,7 @@ print(f"Tree number {swept_tree.index}, which runs from position {intvl.left} to swept_tree.draw_svg(size=(1000, 200)) ``` :::{margin} -The {ref}`visualization tutorial ` gives more drawing possibilities +The {ref}`visualization tutorial` gives more drawing possibilities ::: This tree shows the classic signature of a recent expansion or selection event, with many @@ -227,8 +226,8 @@ sites or mutations in your analyses. For many purposes it may be better to focus on the genealogy of your samples, rather than the {ref}`sites` and {ref}`mutations` that -{ref}`define ` the genome sequence itself. Nevertheless, -{program}`tskit` also provides efficient ways to return {class}`Site` object and +{ref}`define` the genome sequence itself. Nevertheless, +{program}`tskit` also provides efficient ways to return {class}`Site` and {class}`Mutation` objects from a tree sequence. For instance, under the finite sites model of mutation that we used above, multiple mutations can occur at some sites, and we can identify them by iterating over the sites using the @@ -250,13 +249,15 @@ for nmuts, count in enumerate(np.bincount(num_muts)): (sec_processing_genotypes)= -## Processing genotypes +## Extracting genotypes At each site, the sample nodes will have a particular allelic state (or be flagged as {ref}`tskit:sec_data_model_missing_data`). The -{meth}`TreeSequence.variants` method gives access to the -full variation data. For efficiency, the {attr}`~Variant.genotypes` -at a site are returned as a [numpy](https://numpy.org) array of integers: +{meth}`TreeSequence.variants` method gives access to +full variation data - allelic states and node genotypes at each site. +Since nodes are by definition haploid, their genotype at a site is simply the +allelic state, but for efficiency, the {attr}`~Variant.genotypes` attribute +at a site returns a [numpy](https://numpy.org) array of integers. ```{code-cell} ipython3 import numpy as np @@ -272,30 +273,27 @@ for v in ts.variants(): :::{note} Tree sequences are optimised to look at all samples at one site, then all samples at an -adjacent site, and so on along the genome. It is much less efficient look at all the +adjacent site, and so on along the genome. It is much less efficient to look at all the sites for a single sample, then all the sites for the next sample, etc. In other words, you should generally iterate over sites, not samples. Nevertheless, all the alleles for -a single sample can be obtained via the -{meth}`TreeSequence.haplotypes` method. +a single sample can be obtained via the inefficient {meth}`TreeSequence.haplotypes` method. +If a reference sequence is defined, ::: - To find the actual allelic states at a site, you can refer to the {attr}`~Variant.alleles` provided for each {class}`Variant`: the genotype value is an index into this list. Here's one way to print them out; for clarity this example also prints out the IDs of both the sample nodes (i.e. the genomes) -and the diploid {ref}`individuals ` in which each sample +and the diploid {ref}`individuals` in which each sample node resides. - - ````{code-cell} ipython3 samp_ids = ts.samples() print(" ID of diploid individual: ", " ".join([f"{ts.node(s).individual:3}" for s in samp_ids])) print(" ID of (sample) node: ", " ".join([f"{s:3}" for s in samp_ids])) for v in ts.variants(): site = v.site - alleles = np.array(v.alleles) + alleles = np.array(v.alleles) # NB, a shortcut is to use v.states() print(f"Site {site.id} (ancestral state '{site.ancestral_state}')", alleles[v.genotypes]) if site.id >= 4: # only print up to site ID 4 print("...") @@ -309,7 +307,6 @@ such as indels, leading to allelic states which need not be one of these 4 lette even be a single letter. ::: - (sec_tskit_getting_started_compute_statistics)= ## Computing statistics @@ -353,24 +350,24 @@ is the spectrum for a section of the tree sequence between 5 and 5.5Mb, which we created by deleting the regions outside that interval using {meth}`TreeSequence.keep_intervals`. Unsurprisingly, as we noted when looking at the trees, there's a far higher proportion of singletons in -the region of the sweep. +the region of the sweep compared to the entire genome. (sec_tskit_getting_started_compute_statistics_windowing)= ### Windowing It is often useful to see how statistics vary in different genomic regions. This is done -by calculating them in {ref}`tskit:sec_stats_windows` along the genome. For this, +by calculating them in {ref}`windows` along the genome. For this, let's look at a single statistic, the genetic {meth}`~TreeSequence.diversity` (π). As a -site statistic this measures the average number of genetic differences between two -randomly chosen samples, whereas as a branch length statistic it measures the average +site statistic, it measures the average number of genetic differences between two +randomly chosen samples, whereas as a branch statistic, it measures the average branch length between them. We'll plot how the value of π changes using 10kb windows, plotting the resulting diversity between positions 4 and 6 Mb: ```{code-cell} ipython3 fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 3)) L = int(ts.sequence_length) -windows = np.linspace(0, L, num=L//10_000) +windows = np.linspace(0, L, num=L//10_000 + 1) ax1.stairs(ts.diversity(windows=windows), windows/1_000, baseline=None) # Default is mode="site" ax1.set_ylabel("Diversity") ax1.set_xlabel("Genome position (kb)") @@ -380,15 +377,15 @@ ax1.set_yscale("log") ax1.set_ylim(1e-6, 1e-3) ax2.stairs(ts.diversity(windows=windows, mode="branch"), windows/1_000, baseline=None) ax2.set_xlabel("Genome position (kb)") -ax2.set_title("Branch-length-based calculation") +ax2.set_title("Branch-based calculation") ax2.set_xlim(4e3, 6e3) ax2.set_yscale("log") plt.show() ``` There's a clear drop in diversity in the region of the selective sweep. And as expected, -the statistic based on branch-lengths gives a less noisy signal. - +the statistic based on branch lengths gives a less noisy signal than the statistic based +on randomly occurring mutations at sites. (sec_tskit_getting_started_exporting_data)= @@ -434,7 +431,7 @@ print(small_ts.as_nexus(precision=3, include_alignments=False)) ### VCF -The standard way of interchanging genetic variation data is the Variant Call Format, +The standard way of interchanging genetic variation data is the Variant Call Format (VCF), for which tskit has basic support: ```{code-cell} ipython3 @@ -442,9 +439,8 @@ import sys small_ts.write_vcf(sys.stdout) ``` -The write_vcf method takes a file object as a parameter; to get it to write out to the -notebook here we ask it to write to stdout. - +The {meth}`TreeSequence.write_vcf` method takes a file object as a parameter; +to get it to write out to the notebook here we ask it to write to stdout. (sec_tskit_getting_started_exporting_data_allel)= @@ -466,23 +462,31 @@ print(h.n_variants, h.n_haplotypes) h ``` -Sckit.allel has a wide-ranging and efficient suite of tools for working with genotype -data, so should provide anything that's needed. For example, it gives us an -another way to compute the pairwise diversity statistic (that we calculated -{ref}`above` -using the native {meth}`TreeSequence.diversity` method): +{program}`scikit-allel` has a wide-ranging and efficient suite of tools for working +with genotype data for most users' needs. For example, it gives us an +another way to compute the pairwise diversity statistic: ```{code-cell} ipython3 ac = h.count_alleles() allel.mean_pairwise_difference(ac) ``` +Here's the equivalent using the native {meth}`TreeSequence.diversity` method, +{ref}`windowed ` by site: + +```{code-cell} ipython3 +# Per site (not span-normalised to match scikit-allel) +small_ts.diversity(windows="sites", span_normalise=False) +``` + +Note, however, that you would normally want to set `span_normalise=True` to account +for the fact that sites are not evenly spread along the genome (see {ref}`these docs`). (sec_tskit_getting_started_key_points)= ## Key points covered above -Some simple methods and take-home messages from this introduction to the +Here are the simple methods and take-home messages from this introduction to the {program}`tskit` {ref}`sec_python_api`, in rough order of importance: @@ -496,15 +500,15 @@ in rough order of importance: * {ref}`sec_terminology_nodes` (i.e. genomes) can belong to {ref}`individuals`. For example, sampling a diploid individual results in an {class}`Individual` object which - possesses two distinct {ref}`sample nodes`. + links to two distinct {ref}`sample nodes`. * Key tree sequence methods * {meth}`~TreeSequence.samples()` returns an array of node IDs specifying the nodes that are marked as samples * {meth}`~TreeSequence.node` returns the node object for a given integer node ID * {meth}`~TreeSequence.trees` iterates over all the trees * {meth}`~TreeSequence.sites` iterates over all the sites - * {meth}`~TreeSequence.variants` iterates over all the sites with their genotypes - and alleles + * {meth}`~TreeSequence.variants` iterates over all the sites with their list of + unique alleles and genotypes * {meth}`~TreeSequence.simplify()` reduces the number of sample nodes in the tree sequence to a specified subset * {meth}`~TreeSequence.keep_intervals()` (or its complement, @@ -519,4 +523,3 @@ in rough order of importance: sequence, for example {meth}`~TreeSequence.allele_frequency_spectrum`, {meth}`~TreeSequence.diversity`, and {meth}`~TreeSequence.Fst`; these can also be calculated in windows along the genome. - diff --git a/terminology_and_concepts.md b/terminology_and_concepts.md index 72c13cac..f1742114 100644 --- a/terminology_and_concepts.md +++ b/terminology_and_concepts.md @@ -48,8 +48,6 @@ def basics(): ) tables.mutations.time = np.full_like(tables.mutations.time, tskit.UNKNOWN_TIME) tables.tree_sequence().dump("data/basics.trees") - - def create_notebook_data(): basics() @@ -71,23 +69,24 @@ concepts behind {program}`tskit`, the tree sequence toolkit. ::::{margin} :::{note} -See {ref}`sec_intro_downloading_datafiles` to run this tutorial on your own computer +See {ref}`sec_intro_downloading_datafiles` to run this tutorial on your own computer. ::: :::: A tree sequence is a data structure which describes a set of correlated evolutionary trees, together with some associated data that specifies, for example, -the location of mutations in the tree sequence. More technically, a tree sequence -stores a biological structure known as an "Ancestral Recombination Graph", or ARG. +the location of mutations in the genome. More technically, it is a format that can +be used to store a variety of different population genetic structures that are loosely +known as ancestral recombination graphs (ARGs). -Below are the most important {ref}`terms and concepts ` -that you'll encounter in these tutorials, but first we'll {func}`~tskit.load` a tree -sequence from a `.trees` file using the +Below are the most important terms and concepts that you'll encounter in these tutorials. +A concise glossary of these terms and concepts is available at {ref}`here`. +But first we'll {func}`~tskit.load` a tree sequence from a `.trees` file using the {ref}`tskit:sec_python_api` (which will be used in the rest of this tutorial): ```{code-cell} ipython3 import tskit -# We will often store the python tree sequence object in a variable named "ts" +# We will often store the python tree sequence object in a variable named "ts" or "arg" ts = tskit.load("data/basics.trees") ``` @@ -97,7 +96,7 @@ ts = tskit.load("data/basics.trees") :::{note} {ref}`Workarounds` exist to represent a multi-chromosome genome as a tree -sequence, but are not covered here +sequence, but are not covered here. ::: :::: @@ -126,13 +125,12 @@ ts.draw_svg( ) ``` -Each tree records the lines of descent along which a piece of DNA has been -inherited (ignore for the moment the red symbols, which represent a mutation). +Each tree records the lines of descent along which a piece of DNA has been inherited. For example, the first tree tells us that DNA from ancestral genome 7 duplicated to produce two lineages, which ended up in genomes 1 and 4, both of which exist in the current population. In fact, since this pattern is seen in all trees, these particular lines of inheritance were taken by all the DNA in this 1000 base pair genome. - +The red symbol is a mutation, which we will describe later. (sec_terminology_nodes)= @@ -148,7 +146,6 @@ an *internal node*, representing an ancestor in which a single DNA sequence was duplicated (in forward-time terminology) or in which multiple sequences coalesced (in backwards-time terminology). - (sec_terminology_nodes_samples)= #### Sample nodes @@ -163,7 +160,6 @@ labelled $0..5$, and also 6 non-sample nodes, labelled $6..11$, in the tree sequ print("There are", ts.num_nodes, "nodes, of which", ts.num_samples, "are sample nodes") ``` - (sec_terminology_edges)= ### Edges @@ -175,10 +171,9 @@ three trees in the example above has a branch from node 7 to node 1, but those t branches represent just a single edge. Each edge is associated with a parent node ID and a child node ID. The time of the parent -node must be -strictly greater than the time of the child node, and the difference in these times is -sometimes referred to as the "length" of the edge. Since trees in a tree sequence are -usually taken to represent marginal trees along a genome, as well as the time dimension +node must be strictly greater than the time of the child node, and the difference in these times +is sometimes referred to as the "length" of the edge. Since trees in a tree sequence are +usually taken to represent local trees along a genome, as well as the time dimension each edge also has a genomic _span_, defined by a *left* and a *right* position along the genome. There are 15 edges in the tree sequence above. Here's an example of one of them: @@ -210,7 +205,7 @@ sample nodes, $0..5$ in the drawing, are present in all the trees (since we have full genomes), but the other nodes, such as node $9$, need not be: indeed in larger tree sequences they are rarely so. -In tree sequence terminology, we don't explictly keep track of where nodes +In tree sequence terminology, we don't explicitly keep track of where nodes start and end. Only edges (not nodes) possess a genomic span. So for example, this tree sequence is defined using edges like $(7\rightarrow1)$ which span the entire genome, edges like $(11\rightarrow10)$ which only span the leftmost section of the genome, and @@ -237,12 +232,9 @@ information about (say) node $7$, including the IDs of its parent and child node first_tree = ts.first() parent_of_7 = first_tree.parent(7) children_of_7 = first_tree.children(7) -print("Node 7's parent is", parent_of_7, "and childen are", children_of_7, "in the first tree") +print("Node 7's parent is", parent_of_7, "and children are", children_of_7, "in the first tree") ``` - - - (sec_terminology_individuals_and_populations)= ### Individuals and populations @@ -254,7 +246,7 @@ obtain two copies of each autosomal chromosome. The tree with six sample nodes a could therefore represent the result of sampling three diploid individuals from a larger population. The tree sequence can keep track of the individuals in which nodes reside, and store specific information about them (such as the individuals' spatial location) -as well as arbitrary {ref}`metadata ` (such as a name). In this particular +as well as arbitrary {ref}`metadata` (such as a name). In this particular tree sequence the sample nodes are indeed associated with three named diploid individuals: ``Ada``, ``Bob`` and ``Cat``. @@ -324,7 +316,7 @@ this case, site ID 0 (which happens to be at position 751): print(ts.site(0)) # For convenience, the Python API also returns the mutations at the site ``` -In the plot above, since the the only mutation is above node 8 in the last tree, and has +In the plot above, since the only mutation is above node 8 in the last tree, and has a {attr}`~Mutation.derived_state` of "G", we know that the samples descending from node 8 in the last tree (sample genomes 2, 3, and 5) have a "G" at {attr}`~Site.position` 751, while the others have the {attr}`~Site.ancestral_state` of "T". This means that Ada is @@ -333,6 +325,12 @@ In other words the ancestral state and the details of any mutations at that site when coupled with the tree topology at the site {attr}`~Site.position`, is sufficient to define the allelic state possessed by each sample. +{program}`Tskit` can also handle multiple mutations at a site. In this case, it can be +helpful to know the {attr}`~Mutation.inherited_state` for a mutation (i.e. the state at this +site before the mutation occurred). Often this might be the {attr}`~Site.ancestral_state`, +but it need not be if there are sequential mutations along a single path in a tree. In this +case the mutation will also be associated with the ID of a {attr}`~Mutation.parent` mutation. + Note that even though the genome is 1000 base pairs long, the tree sequence only contains a single site, because we usually only bother defining *variable* sites in a tree sequence (e.g. positions seen in studies to have samples possessing different alleles at @@ -340,7 +338,6 @@ that genomic location). It is perfectly possible to have a site with no mutation (or silent mutations) --- i.e. a "monomorphic" site --- but such sites are not normally used in further analysis. - (sec_terminology_provenance)= ### Provenance @@ -354,7 +351,6 @@ call to msprime that produced it, and the second the call to provenance entries are sufficient to exactly recreate the tree sequence, but this is not always possible. - (sec_concepts)= ## Concepts @@ -375,7 +371,7 @@ ts.draw_svg( Note that all these trees show strictly bifurcating splits, but this does not need to be the case for tree sequences in general. In particular, where tree sequences have been -created by the multiple-merger coalecent process, or in tree +created by the multiple-merger coalescent process, or in tree sequences that have been inferred from real data, it is possible to have a parent node with 3 or more children in a particular tree (these are known as *polytomies*). @@ -385,17 +381,18 @@ with 3 or more children in a particular tree (these are known as *polytomies*). ### Tree changes, ancestral recombinations, and SPRs The process of recombination usually results in trees along a genome where adjacent -trees differ by only a few "tree edit" or SPR (subtree-prune-and-regraft) operations. +trees differ by only a few "tree edit" or subtree-prune-and-regraft (SPR) operations. The result is a tree sequence in which very few edges {ref}`change from tree to tree`. -This is the underlying reason that `tskit` is so +This is the underlying reason that {program}`tskit` is so efficient, and is well illustrated in the example tree sequence above. In this (simulated) tree sequence, each tree differs from the next by a single SPR. -The subtree defined by node 7 in the first tree has been pruned and regrafted onto the -branch between 0 and 10, to create the second tree. The second and third trees have the -same topology, but differ because their ultimate coalesence happened in a different -ancestor (easy to spot in a simulation, but hard to detect in real data). This is also +The subtree defined by node 7 in the first tree has been pruned (away from node 11) and +regrafted onto the branch between 0 and 9, to create the second tree. +The second and third trees have the same topology, +but differ because their ultimate coalescence happened in a different ancestor +(easy to spot in a simulation, but hard to detect in real data). This is also caused by a single SPR: looking at the second tree, either the subtree below node 8 or the subtree below node 9 must have been pruned and regrafted higher up on the same lineage to create the third tree. Because this is a fully {ref}`simplified` @@ -404,28 +401,43 @@ know this, we would need to have a tree sequence with the exact history of recom recorded (see below). In general, each detectable recombination occurring in ancestral history results in a -single SPR in the tree sequence. If recombination breakpoints occurs at unique +single SPR in the tree sequence. If recombination breakpoints occur at unique positions (an "infinite sites" model of breakpoints), then the number of trees in a tree sequence equals the number of ancestral recombination events plus one. If recombinations can occur at the same physical position (e.g. if the genome is treated as a set of discrete integer positions, as in the simulation that created this tree sequence) then -moving from one tree to the next in a tree sequence might require multiple SPRs if +moving from one tree to the next in a tree sequence might require multiple SPRs if there are multiple, overlaid ancestral recombination events. +### Tables + +The underlying data in a tree sequence are stored in a collection of tables. Details of +these tables, as well as some basic properties of the tree sequence which they encode, +are shown when printing a tree sequence to the screen: + +```{code-cell} ipython3 +ts # or use `print(ts)` for the plain text representation +``` + +There is a separate API for dealing with a tree sequence as a collection of tables, +which can be useful for dealing with large datasets, and is needed if you want to +{ref}`edit` an existing tree sequence. This is covered in +the {ref}`sec_tables` tutorial. + (sec_concepts_args)= ### Tree sequences and ARGs ::::{margin} :::{note} -There is a subtle distinction between common ancestry and coalescence. In particular, all coalescent nodes are common ancestor events, but not all common ancestor events in an ARG result in coalescence in a local tree. +There is a subtle distinction between common ancestry and coalescence. In particular, all coalescent nodes are common ancestor events, but not all common ancestor events in an ARG result in coalescence in all local trees. ::: :::: -The term "Ancestral Recombination Graph", or ARG, is commonly used to describe a genetic +The term Ancestral Recombination Graph (ARG), is commonly used to describe a genetic genealogy. In particular, many (but not all) authors use it to mean a genetic genealogy in which details of the position and potentially the timing of all -recombination and common ancestor events are explictly stored. For clarity +recombination and common ancestor events are explicitly stored. For clarity we refer to this sort of genetic genealogy as a "full ARG". Succinct tree sequences can represent many different sorts of ARGs, including "full ARGs", by incorporating extra non-coalescent nodes (see the {ref}`sec_args` tutorial). However, tree sequences are @@ -438,11 +450,11 @@ which omits these extra nodes. This is for two main reasons: 2. The number of recombination and non-coalescing common ancestor events in the genealogy quickly grows to dominate the total number of nodes in the tree sequence, without actually contributing to the mutations inherited by the samples. - In other words, these nodes are redundant to the storing of genome data. + In other words, these nodes are redundant to the storing of genomic data. Therefore, compared to a full ARG, you can think of a simplified tree sequence as storing the trees *created by* recombination events, rather than attempting to record the -recombination events themselves. The actual recombination events can be sometimes be +recombination events themselves. The actual recombination events can sometimes be inferred from these trees but, as we have seen, it's not always possible. Here's another way to put it: @@ -450,20 +462,7 @@ way to put it: > whereas a [simplified] tree sequence encodes the outcome of those events" > ([Kelleher _et al._, 2019](https://doi.org/10.1534/genetics.120.303253)) - -### Tables - -The underlying data in a tree sequence are stored in a collection of tables. Details of -these tables, as well as some basic properties of the tree sequence which they encode, -are shown when printing a tree sequence to the screen: - -```{code-cell} ipython3 -ts # or use `print(ts)` for the plain text representation -``` - -There is a separate API for dealing with a tree sequence as a collection of tables, -which can be useful for dealing with large datasets, and is needed if you want to -{ref}`edit` an existing tree sequence. This is covered in -the {ref}`sec_tables` tutorial. +[Wong _et al._, 2024](https://doi.org/10.1093/genetics/iyae100) +review this topic in detail. diff --git a/viz.md b/viz.md index 2c806b92..6de6a5aa 100644 --- a/viz.md +++ b/viz.md @@ -2228,7 +2228,7 @@ def size_max(graph): # See https://popsim-consortium.github.io/demes-docs/ for the yml spec for the file below graph = demes.load("data/whatis_example.yml") w = 1.5 * size_max(graph) -positions = dict(Ancestral_population=0, A=-w, B=w) +positions = dict(Ancestral_population=0, P=-w, Q=w) ax = demesdraw.tubes(graph, positions=positions, seed=1) plt.show(ax.figure) ``` diff --git a/what_is.md b/what_is.md index faf89a6c..31747f34 100644 --- a/what_is.md +++ b/what_is.md @@ -13,8 +13,10 @@ kernelspec: ```{code-cell} ipython3 :tags: [remove-cell] -import msprime import demes +import msprime +import numpy as np +import tskit def whatis_example(): @@ -29,9 +31,9 @@ def whatis_example(): - name: Ancestral_population epochs: - end_time: 1000 - - name: A + - name: P ancestors: [Ancestral_population] - - name: B + - name: Q ancestors: [Ancestral_population] epochs: - start_size: 2000 @@ -39,8 +41,8 @@ def whatis_example(): - start_size: 400 end_size: 10000 migrations: - - source: A - dest: B + - source: P + dest: Q rate: 1e-4 """ with open("data/whatis_example.yml", "wt") as f: @@ -49,18 +51,51 @@ def whatis_example(): demography = msprime.Demography.from_demes(graph) # Choose seed so num_trees=3, tips are in same order, # first 2 trees are topologically different, and all trees have the same root - seed = 12581 - ts = msprime.sim_ancestry( - samples={"A": 2, "B": 3}, - demography=demography, - recombination_rate=1e-8, - sequence_length=1000, - random_seed=seed) + for seed in range(21964, 10000000): + ts = msprime.sim_ancestry( + samples={"P": 2, "Q": 3}, + demography=demography, + recombination_rate=1e-8, + sequence_length=900, + additional_nodes=( + msprime.NodeType.RECOMBINANT | + msprime.NodeType.COMMON_ANCESTOR + ), + coalescing_segments_only=False, + random_seed=seed) + if ts.num_trees != 3: + continue + if not len(set(tree.root for tree in ts.trees())) == 1: + continue + sts = ts.simplify() + if sts.num_trees != ts.num_trees: + continue + if sts.at_index(0).rank() == sts.at_index(1).rank(): + continue + if sts.at_index(1).rank() == sts.at_index(2).rank(): + continue + samps = [list(u for u in tree.nodes(order="minlex_postorder") if tree.is_sample(u)) for tree in sts.trees()] + if samps[0] != samps[1] or samps[0] != samps[2]: + continue + print(f"Used demography seed {seed}") + break # Mutate - # Choose seed to give 12 muts, last one above node 14 - seed = 1476 - ts = msprime.sim_mutations(ts, rate=1e-7, random_seed=seed) + # Choose seed to give 12 muts, last one above node 15 + for seed in range(273, 10000): + mts = msprime.sim_mutations(ts, rate=2e-7, random_seed=seed) + if mts.num_mutations != 12: + continue + if mts.mutation(-1).node != 15: + continue + print(f"Used mutations seed {seed}") + break + # put the mutations along midpoints of branches + tables = mts.dump_tables() + tables.mutations.time = np.full_like(tables.mutations.time, tskit.UNKNOWN_TIME) + ts = tables.tree_sequence() ts.dump("data/whatis_example.trees") + return ts + def create_notebook_data(): whatis_example() @@ -77,9 +112,11 @@ relationships between a set of DNA sequences. Tree sequences are based on fundam biological principles of inheritance, DNA duplication, mutation, and recombination; they can be created by [evolutionary simulation](https://tskit.dev/software/#simulate) or by [inferring genealogies from empirical DNA data](https://tskit.dev/software/#infer). +Technically, this means they +{ref}`can be used to represent "ancestral recombination graphs"`, or ARGs. :::{margin} Key point -Tree sequences are used to encode and analyse large genetic datasets +Tree sequences are used to encode and analyse large genetic datasets. ::: Tree sequences provide an efficient way of storing @@ -140,10 +177,15 @@ plt.show() ### A sequence of trees... As the name suggests, the simplest way to think about a tree sequence is that it -describes a sequence of correlated "local trees" --- i.e. genetic trees located at +describes a sequence of correlated "local trees" --- i.e. evolutionary trees located at different points along a [chromosome](https://en.wikipedia.org/wiki/Chromosome). -Here's a tiny example based on ten haploid genomes, $\mathrm{a}$ to $\mathrm{j}$, -spanning a short 1000 letter chromosome. + +Below is a tiny example based on 10 haploid copies of a short chromosome spanning only +900 basepairs (letters) in length. Each chromosome copy, which we can think of as a +small genome, is shown as a square symbol. These are labelled $\mathrm{a}$ to $\mathrm{j}$ +and represent "sampled" lengths of DNA, whose ancestry and genetic +variation is stored in the tree sequence. The tickmarks on the X axis and their +associated background shading indicate the genomic positions covered by the local trees. ```{code-cell} ipython3 :"tags": ["hide-input"] @@ -156,14 +198,23 @@ ts = mutated_ts.delete_sites(list(range(mutated_ts.num_sites))) labels = {i: string.ascii_lowercase[i] for i in range(ts.num_nodes)} genome_order = [n for n in ts.first().nodes(order="minlex_postorder") if ts.node(n).is_sample()] labels.update({n: labels[i] for i, n in enumerate(genome_order)}) +rev_lab = {v: k for k, v in labels.items()} style1 = ( ".node:not(.sample) > .sym, .node:not(.sample) > .lab {visibility: hidden;}" - ".mut {font-size: 12px} .y-axis .tick .lab {font-size: 85%}") + ".mut {font-size: 12px} .y-axis .tick .lab {font-size: 85%}" +) +highlt = ".t0 .n%(u)i .lab, .t1 .n%(u)i .lab {font-weight: bold; fill: red}" % {"u": rev_lab["e"]} + sz = (800, 250) # size of the plot, slightly larger than the default -ticks = [0, 5000, 10000, 15000, 20000] +ticks = [0, 5000, 10000, 15000] ts.draw_svg( - size=sz, node_labels=labels, style=style1, y_label="Time ago", - y_axis=True, y_ticks=ticks) + size=sz, + node_labels=labels, + style=style1 + highlt, + y_label="Time ago", + y_axis=True, + y_ticks=ticks, +) ``` ::::{margin} @@ -173,30 +224,52 @@ the nodes are referred to by {ref}`numerical ID`. ::: :::: -The tickmarks on the X axis and background shading indicate the genomic positions covered -by the trees. The tickmarks indicate recombination events that explain relationships -between the ten genomes. There were two such recombination events, giving us three local trees. -For the first short portion of the chromosome, from the start until position 189, + +For the first short portion of the chromosome, from the start until position 367, the relationships between the ten genomes are shown by the first tree. -The second tree shows the relationships between positions 189 and 546. -By inspecting the first and the second local tree we can see that genomes $\mathrm{b}-\mathrm{f}$ -changed their "most recent common ancestor" (MRCA) with genome $\mathrm{a}$ to -MRCA with genome $\mathrm{g}$. -The third tree shows the relationships between positions 546 and 1000 (the end). -By inspecting the second and the third local tree we can see that -recombination changed the ancestry of genomes $\mathrm{b}-\mathrm{f}$ -back to shared MRCA with genome $\mathrm{g}$. +The second tree shows the relationships between positions 367 and 600. +Note that in the first tree, genome $\mathrm{e}$ (highlighted in red) is closest to +$\mathrm{a}-\mathrm{d}$, whereas in the second tree $\mathrm{e}$ is closest to +$\mathrm{f}-\mathrm{h}$. The third tree shows the relationships between positions +600 and 900 (the end of the genome). In this case, an entire subtree (or "clade"), +composed of nodes $\mathrm{e}-\mathrm{h}$ has changed its relationship with the other six +genomes. More specifically, the most recent common ancestor (MRCA) with any of these +others has switched: in the third tree it is now above $\mathrm{i}$ and $\mathrm{j}$. (sec_what_is_genealogical_network)= ### ... created by a genealogical network -In fact, succinct tree sequences don't store each tree separately, but instead are -based on an interconnected *genetic genealogy*, in which -[genetic recombination](https://en.wikipedia.org/wiki/Genetic_recombination) has led +In fact, succinct tree sequences don't store each tree separately, but are instead +based on a network of *genetic ancestry*, in which +[genetic recombination](https://en.wikipedia.org/wiki/Genetic_recombination) +(represented by red symbols below) has led to different regions of the chromosome having different histories. Another way of -thinking about the tree sequence above is that it describes the full genetic ancestry -of our 10 genomes. +thinking about the tree sequence above is that it describes the full genetic genealogy +of our 10 genomes. If we combine all the relationships encoded in the trees (you can +loosely think of this as lying the trees on top of each other), the result is a network or +_graph_ (hence the term "[ARG](https://doi.org/10.1371/journal.pgen.1011110)") + +```{code-cell} ipython3 +:"tags": ["hide-input"] +import json +import tskit_arg_visualizer as argviz +d3arg = argviz.D3ARG.from_ts(ts=ts) +with open("data/whatis_example.json") as f: + d3arg.set_node_x_positions( + argviz.extract_x_positions_from_json(json.load(f)) + ) +d3arg.nodes.loc[:, 'label'] = "" +d3arg.nodes.loc[d3arg.nodes.ts_flags & msprime.NODE_IS_RE_EVENT != 0, 'fill'] = "red" +d3arg.set_node_labels({u: labels[u] for u in ts.samples()}) +d3arg.set_all_node_styles(size=50, stroke_width=3) +d3arg.set_node_styles({u: {"symbol": "d3.symbolSquare", "fill": "black"} for u in ts.samples()}) +d3arg.draw( + title="An interactive graph representation\n(hover over the bottom bar to see the trees)", + edge_type="ortho", + height=220); +``` + (sec_what_is_dna_data)= @@ -226,12 +299,12 @@ branches of the trees, and the positions of the twelve variable sites associated mutations are shown along the X axis. :::{margin} Key point -Mutation on trees are the source of genetic variation +Mutation on trees are the source of genetic variation. ::: -The trees inform us that, for example, the final mutation (at position 987) is inherited -by genomes $\mathrm{h}$ to $\mathrm{j}$. These genomes must have an *T* at that position, -compared to the original value of *G*. In other words, once we know the ancestry, placing +The trees inform us that, for example, the final mutation (at position 851) is inherited +by genomes $\mathrm{i}$ and $\mathrm{j}$. These genomes must have an *C* at that position, +compared to the original value of *A*. In other words, once we know the ancestry, placing a relatively small number of mutations is enough to explain all the observed genetic variation. Here's the resulting "variant matrix": @@ -266,7 +339,7 @@ ts.draw_svg( ``` :::{margin} Key point -Tree sequences are efficient because they don't store each tree separately +Tree sequences are efficient because they don't store each tree separately. ::: A branch can be shared by many adjacent trees, but is stored as a single edge in the tree @@ -312,14 +385,6 @@ plt.show() ## A record of genetic ancestry -::::{margin} -:::{note} -The genetic genealogy is sometimes referred to as an ancestral recombination graph, -or ARG, and one way to think of tskit tree sequence is as a way -to store various different sorts of ARGs (see the {ref}`ARG tutorial`) -::: -:::: - Often, we're not interested so much in the DNA sequence data as the genetic ancestry itself (discussed e.g. [here](https://www.nature.com/articles/s41588-019-0492-x)). In other words, the main consideration is the actual trees in a tree sequence, rather @@ -333,8 +398,8 @@ The tree sequence in this tutorial was actually generated using a model of popul splits and expansions as shown in the following schematic, {ref}`plotted` using the [DemesDraw](https://pypi.org/project/demesdraw/) package. Our 10 genomes were sampled -from modern day populations A (a constant-size population) and B (a recently expanding -one), where limited migration is occuring from A to B. +from modern day populations P (a constant-size population) and Q (a recently expanding +one), where limited migration is occuring from P to Q. ```{code-cell} ipython3 :"tags": ["remove-input"] @@ -352,7 +417,7 @@ def size_max(graph): graph = demes.load("data/whatis_example.yml") w = 1.5 * size_max(graph) -positions = dict(Ancestral_population=0, A=-w, B=w) +positions = dict(Ancestral_population=0, P=-w, Q=w) fig, ax = plt.subplots(1, figsize=(5, 3)) ax = demesdraw.tubes(graph, ax=ax, positions=positions, seed=1) plt.show(ax.figure) @@ -376,7 +441,7 @@ style2 = ".y-axis .tick .lab {font-size: 85%}" style2 += "#svg2 .node > .sym {visibility: visible;}" # force-show all nodes: not normally needed style2 += "".join([f".p{n.population} > .sym {{fill: {colours[n.population]}}}" for n in ts.nodes()]) -mutated_ts.draw_svg( +mutated_ts.simplify().draw_svg( size=sz, root_svg_attributes={'id':'svg2'}, y_label="Time ago (generations)", y_axis=True, y_ticks=ticks, node_labels=labels, mutation_labels={}, style=style2) ``` @@ -389,7 +454,7 @@ deduce these MRCA genomes, simply by looking at which mutations they have inheri ```{code-cell} ipython3 :"tags": ["hide-input"] import numpy as np -tables = mutated_ts.dump_tables() +tables = mutated_ts.simplify().dump_tables() # Flip sample and nonsample flags, making the haplotypes() method print out nonsample nodes s_flags = tables.nodes.flags[ts.samples()[0]] no_flags = s_flags-s_flags @@ -428,7 +493,7 @@ multiple correlated trees along a genome. ```{margin} Key point Most genetic calculations involve iterating over trees, which is highly efficient in -{program}`tskit` +{program}`tskit`. ``` For example, statistical measures of genetic variation can be thought of as a calculation @@ -466,7 +531,7 @@ The {program}`tskit` library has {ref}`extensive support` and {ref}`tree sequences`, as well as providing other -phylogenetically relevant methods such as +relevant methods such as principal component analysis, {ref}`parsimonious placement of mutations`, and the {ref}`counting of topologies` embedded within larger trees.