diff --git a/data/whatis_example.json b/data/whatis_example.json
new file mode 100644
index 00000000..29b15b13
--- /dev/null
+++ b/data/whatis_example.json
@@ -0,0 +1 @@
+{"data":{"nodes":[{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":0,"ts_flags":1,"time":0,"child_of":[10],"parent_of":[],"x_pos_reference":-1,"label":"a","fx":150,"fy":250,"y":250,"index":0,"x":150,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":1,"ts_flags":1,"time":0,"child_of":[18],"parent_of":[],"x_pos_reference":-1,"label":"d","fx":283,"fy":250,"y":250,"index":1,"x":283,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":2,"ts_flags":1,"time":0,"child_of":[13],"parent_of":[],"x_pos_reference":-1,"label":"e","fx":328,"fy":250,"y":250,"index":2,"x":328,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":3,"ts_flags":1,"time":0,"child_of":[10],"parent_of":[],"x_pos_reference":-1,"label":"b","fx":194,"fy":250,"y":250,"index":3,"x":194,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":4,"ts_flags":1,"time":0,"child_of":[17],"parent_of":[],"x_pos_reference":-1,"label":"c","fx":239,"fy":250,"y":250,"index":4,"x":239,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":5,"ts_flags":1,"time":0,"child_of":[12],"parent_of":[],"x_pos_reference":-1,"label":"f","fx":372,"fy":250,"y":250,"index":5,"x":372,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":6,"ts_flags":1,"time":0,"child_of":[11],"parent_of":[],"x_pos_reference":-1,"label":"g","fx":417,"fy":250,"y":250,"index":6,"x":417,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":7,"ts_flags":1,"time":0,"child_of":[15],"parent_of":[],"x_pos_reference":-1,"label":"i","fx":506,"fy":250,"y":250,"index":7,"x":506,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":8,"ts_flags":1,"time":0,"child_of":[11],"parent_of":[],"x_pos_reference":-1,"label":"h","fx":461,"fy":250,"y":250,"index":8,"x":461,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolSquare","fill":"black","stroke":"#053e4e","stroke_width":3,"id":9,"ts_flags":1,"time":0,"child_of":[15],"parent_of":[],"x_pos_reference":-1,"label":"j","fx":550,"fy":250,"y":250,"index":9,"x":550,"vy":0,"vx":0},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":10,"ts_flags":0,"time":311.25976965689426,"child_of":[17],"parent_of":[0,3],"x_pos_reference":-1,"label":"","x":172,"fy":234.6153846153846,"y":234.6153846153846,"index":10,"vy":0,"vx":0,"fx":172},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":11,"ts_flags":0,"time":487.8028220685495,"child_of":[12],"parent_of":[6,8],"x_pos_reference":-1,"label":"","x":437.77289828142125,"fy":219.23076923076923,"y":219.23076923076923,"index":11,"vy":0,"vx":0,"fx":437.77289828142125},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":12,"ts_flags":0,"time":1223.6271803895934,"child_of":[16],"parent_of":[5,11],"x_pos_reference":-1,"label":"","x":397.41637985695985,"fy":203.84615384615384,"y":203.84615384615384,"index":12,"vy":0,"vx":4.6619708093942336e-11,"fx":397.41637985695985},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":13,"ts_flags":131072,"time":1246.4183161012481,"child_of":[16,19],"parent_of":[2],"x_pos_reference":2,"label":"","x":328,"fy":188.46153846153845,"y":188.46153846153845,"index":13,"vy":0,"vx":0,"fx":328},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":15,"ts_flags":0,"time":1271.2734779542789,"child_of":[22],"parent_of":[7,9],"x_pos_reference":-1,"label":"","x":529.322755682009,"fy":173.0769230769231,"y":173.0769230769231,"index":14,"vy":0,"vx":8.158757003329243e-12,"fx":529.322755682009},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":16,"ts_flags":0,"time":1659.725091972838,"child_of":[20],"parent_of":[12,13],"x_pos_reference":-1,"label":"","x":373.31328138997435,"fy":157.69230769230768,"y":157.69230769230768,"index":15,"vy":0,"vx":-2.0449017479329257e-10,"fx":373.31328138997435},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":17,"ts_flags":0,"time":1961.7599532212648,"child_of":[18],"parent_of":[4,10],"x_pos_reference":-1,"label":"","x":194.38099308073106,"fy":142.3076923076923,"y":142.3076923076923,"index":16,"vy":0,"vx":6.79103440148765e-12,"fx":194.38099308073106},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":18,"ts_flags":0,"time":4135.539932100284,"child_of":[19],"parent_of":[1,17],"x_pos_reference":-1,"label":"","x":244.8923505399351,"fy":126.92307692307692,"y":126.92307692307692,"index":17,"vy":0,"vx":-8.945102991653186e-12,"fx":244.8923505399351},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":19,"ts_flags":0,"time":6272.301763476431,"child_of":[23],"parent_of":[13,18],"x_pos_reference":-1,"label":"","x":287.7165695894524,"fy":111.53846153846155,"y":111.53846153846155,"index":18,"vy":0,"vx":0,"fx":287.7165695894524},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":20,"ts_flags":131072,"time":7008.788482970518,"child_of":[22,23],"parent_of":[16],"x_pos_reference":16,"label":"","x":373.3132813901788,"fy":96.15384615384615,"y":96.15384615384615,"index":19,"vy":0,"vx":0,"fx":373.3132813901788},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":22,"ts_flags":0,"time":8592.848809367675,"child_of":[24],"parent_of":[15,20],"x_pos_reference":-1,"label":"","x":458.049985157943,"fy":80.76923076923075,"y":80.76923076923075,"index":20,"vy":0,"vx":8.623578517941155e-11,"fx":458.049985157943},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":23,"ts_flags":0,"time":10524.123281839178,"child_of":[24],"parent_of":[19,20],"x_pos_reference":-1,"label":"","x":326.7442210897065,"fy":65.38461538461537,"y":65.38461538461537,"index":21,"vy":0,"vx":0,"fx":326.7442210897065},{"size":50,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":3,"id":24,"ts_flags":0,"time":15165.157197158467,"child_of":[],"parent_of":[22,23],"x_pos_reference":-1,"label":"","x":397.8856001292887,"fy":50,"y":50,"index":22,"vy":0,"vx":8.095707437760779e-12,"fx":397.8856001292887}],"links":[{"id":0,"source":10,"source_time":311.25976965689426,"target":0,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":3,"region_fraction":1,"stroke":"#053e4e"},{"id":1,"source":10,"source_time":311.25976965689426,"target":3,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":0,"region_fraction":1,"stroke":"#053e4e"},{"id":2,"source":11,"source_time":487.8028220685495,"target":6,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":8,"region_fraction":1,"stroke":"#053e4e"},{"id":3,"source":11,"source_time":487.8028220685495,"target":8,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":6,"region_fraction":1,"stroke":"#053e4e"},{"id":4,"source":12,"source_time":1223.6271803895934,"target":5,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":11,"region_fraction":1,"stroke":"#053e4e"},{"id":5,"source":12,"source_time":1223.6271803895934,"target":11,"target_time":487.8028220685495,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":5,"region_fraction":1,"stroke":"#053e4e"},{"id":6,"source":13,"source_time":1246.4183161012481,"target":2,"target_time":0,"bounds":"0.0-367.0 367.0-900.0","alt_parent":-1,"alt_child":-1,"region_fraction":1,"stroke":"#053e4e"},{"id":7,"source":15,"source_time":1271.2734779542789,"target":7,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":9,"region_fraction":1,"stroke":"#053e4e"},{"id":8,"source":15,"source_time":1271.2734779542789,"target":9,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":7,"region_fraction":1,"stroke":"#053e4e"},{"id":9,"source":16,"source_time":1659.725091972838,"target":12,"target_time":1223.6271803895934,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":13,"region_fraction":1,"stroke":"#053e4e"},{"id":10,"source":16,"source_time":1659.725091972838,"target":13,"target_time":1246.4183161012481,"bounds":"367.0-900.0","alt_parent":19,"alt_child":12,"region_fraction":0.5922222222222222,"stroke":"#053e4e"},{"id":11,"source":17,"source_time":1961.7599532212648,"target":4,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":10,"region_fraction":1,"stroke":"#053e4e"},{"id":12,"source":17,"source_time":1961.7599532212648,"target":10,"target_time":311.25976965689426,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":4,"region_fraction":1,"stroke":"#053e4e"},{"id":13,"source":18,"source_time":4135.539932100284,"target":1,"target_time":0,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":17,"region_fraction":1,"stroke":"#053e4e"},{"id":14,"source":18,"source_time":4135.539932100284,"target":17,"target_time":1961.7599532212648,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":1,"region_fraction":1,"stroke":"#053e4e"},{"id":15,"source":19,"source_time":6272.301763476431,"target":13,"target_time":1246.4183161012481,"bounds":"0.0-367.0","alt_parent":16,"alt_child":18,"region_fraction":0.4077777777777778,"stroke":"#053e4e"},{"id":16,"source":19,"source_time":6272.301763476431,"target":18,"target_time":4135.539932100284,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":13,"region_fraction":1,"stroke":"#053e4e"},{"id":17,"source":20,"source_time":7008.788482970518,"target":16,"target_time":1659.725091972838,"bounds":"0.0-600.0 600.0-900.0","alt_parent":-1,"alt_child":-1,"region_fraction":1,"stroke":"#053e4e"},{"id":18,"source":22,"source_time":8592.848809367675,"target":15,"target_time":1271.2734779542789,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":20,"region_fraction":1,"stroke":"#053e4e"},{"id":19,"source":22,"source_time":8592.848809367675,"target":20,"target_time":7008.788482970518,"bounds":"600.0-900.0","alt_parent":23,"alt_child":15,"region_fraction":0.3333333333333333,"stroke":"#053e4e"},{"id":20,"source":23,"source_time":10524.123281839178,"target":19,"target_time":6272.301763476431,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":20,"region_fraction":1,"stroke":"#053e4e"},{"id":21,"source":23,"source_time":10524.123281839178,"target":20,"target_time":7008.788482970518,"bounds":"0.0-600.0","alt_parent":22,"alt_child":19,"region_fraction":0.6666666666666666,"stroke":"#053e4e"},{"id":22,"source":24,"source_time":15165.157197158467,"target":22,"target_time":8592.848809367675,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":23,"region_fraction":1,"stroke":"#053e4e"},{"id":23,"source":24,"source_time":15165.157197158467,"target":23,"target_time":10524.123281839178,"bounds":"0.0-900.0","alt_parent":-1,"alt_child":22,"region_fraction":1,"stroke":"#053e4e"}],"mutations":[],"breakpoints":[{"id":0,"start":0,"stop":367,"x_pos_01":0,"width_01":0.4077777777777778,"fill":"#053e4e","x_pos":100,"width":203.88888888888889,"included":true},{"id":1,"start":367,"stop":600,"x_pos_01":0.4077777777777778,"width_01":0.2588888888888889,"fill":"#053e4e","x_pos":303.8888888888889,"width":129.44444444444446,"included":true},{"id":2,"start":600,"stop":900,"x_pos_01":0.6666666666666666,"width_01":0.3333333333333333,"fill":"#053e4e","x_pos":433.3333333333333,"width":166.66666666666666,"included":true}],"evenly_distributed_positions":[150,194,239,283,328,372,417,461,506,550]},"width":600,"height":375,"y_axis":{"title":"Time ago (generations)","units":"generations","include_labels":true,"ticks":[250,234.6153846153846,219.23076923076923,203.84615384615384,188.46153846153845,173.0769230769231,157.69230769230768,142.3076923076923,126.92307692307692,111.53846153846155,96.15384615384615,80.76923076923075,65.38461538461537,50],"text":[0,311,488,1224,1246,1271,1660,1962,4136,6272,7009,8593,10524,15165],"max_min":[250,50],"scale":"rank"},"edges":{"type":"ortho","variable_width":false,"include_underlink":true},"condense_mutations":false,"label_mutations":false,"tree_highlighting":true,"rotate_tip_labels":false,"plot_type":"full","default_node_style":{"size":150,"symbol":"d3.symbolCircle","fill":"#1eebb1","stroke":"#053e4e","stroke_width":4},"title":null,"preamble":null,"save_filename":"tskit_arg_visualizer"}
\ No newline at end of file
diff --git a/data/whatis_example.trees b/data/whatis_example.trees
index ea5f519b..71fd6915 100644
Binary files a/data/whatis_example.trees and b/data/whatis_example.trees differ
diff --git a/data/whatis_example.yml b/data/whatis_example.yml
index f172a1fc..caef6b39 100644
--- a/data/whatis_example.yml
+++ b/data/whatis_example.yml
@@ -8,9 +8,9 @@
           - name: Ancestral_population
             epochs:
               - end_time: 1000
-          - name: A
+          - name: P
             ancestors: [Ancestral_population]
-          - name: B
+          - name: Q
             ancestors: [Ancestral_population]
             epochs:
               - start_size: 2000
@@ -18,7 +18,7 @@
               - start_size: 400
                 end_size: 10000
         migrations:
-          - source: A
-            dest: B
+          - source: P
+            dest: Q
             rate: 1e-4
         
\ No newline at end of file
diff --git a/getting_started.md b/getting_started.md
index 1a529e5d..35ef3baf 100644
--- a/getting_started.md
+++ b/getting_started.md
@@ -18,13 +18,13 @@ kernelspec:
 # Getting started with {program}`tskit`
 
 You've run some simulations or inference methods, and you now have a 
-{class}`TreeSequence` object; what now? This tutorial is aimed 
+{class}`TreeSequence` object; what now? This tutorial is aimed at
 users who are new to {program}`tskit` and would like to get some
 basic tasks completed. We'll look at five fundamental things you might
-need to do:
+want to do:
 {ref}`process trees<sec_processing_trees>`,
-{ref}`sites & mutations<sec_processing_sites_and_mutations>`, and
-{ref}`genotypes<sec_processing_genotypes>`,
+{ref}`process sites & mutations<sec_processing_sites_and_mutations>`,
+{ref}`extract genotypes<sec_processing_genotypes>`,
 {ref}`compute statistics<sec_tskit_getting_started_compute_statistics>`, and
 {ref}`save or export data<sec_tskit_getting_started_exporting_data>`.
 Throughout, we'll also provide pointers to where you can learn more.
@@ -32,7 +32,7 @@ Throughout, we'll also provide pointers to where you can learn more.
 :::{note}
 The examples in this
 tutorial are all written using the {ref}`tskit:sec_python_api`, but it's also possible to
-{ref}`use R <sec_tskit_r>`, or access the API in other languages, notably
+{ref}`use R<sec_tskit_r>`, or access the API in other languages, notably
 {ref}`C<sec_c_api>` and [Rust](https://github.com/tskit-dev/tskit-rust).
 :::
 
@@ -43,7 +43,6 @@ twenty diploid individuals. To make it a bit more interesting, we'll simulate th
 of a {ref}`selective sweep<msprime:sec_ancestry_models_selective_sweeps>` in the middle
 of the chromosome, then throw some neutral mutations onto the resulting tree sequence.
 
-
 ```{code-cell} ipython3
 import msprime
 
@@ -59,9 +58,9 @@ ts = msprime.sim_ancestry(
     population_size=pop_size,
     sequence_length=seq_length,
     recombination_rate=1e-8,
-    random_seed=1234,  # only needed for repeatabilty
+    random_seed=1234,  # only needed for repeatability
     )
-# Optionally add finite-site mutations to the ts using the Jukes & Cantor model, creating SNPs
+# Optionally add finite-site mutations to the ts using the default Jukes & Cantor model, creating SNPs
 ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=4321)
 ts
 ```
@@ -119,14 +118,14 @@ forwards through trees in a tree sequence (or indeed backwards using the standar
 {func}`~py:reversed` function) is efficient. That means it's quick, for example to check if all
 the trees in a tree sequence have fully coalesced (which is to be expected in
 reverse-time, coalescent simulations, but not always for tree sequences produced by
-{ref}`forward simulation <sec_tskit_forward_simulations>`).
+{ref}`forward simulation<sec_tskit_forward_simulations>`).
 
 ```{code-cell} ipython3
 import time
 elapsed = time.time()
 for tree in ts.trees():
     if tree.has_multiple_roots:
-        print("Tree {tree.index} has not coalesced")
+        print(f"Tree {tree.index} has not coalesced")
         break
 else:
     elapsed = time.time() - elapsed
@@ -159,7 +158,7 @@ plt.show()
 It's obvious that there's something unusual about the trees in the middle of this
 chromosome, where the selective sweep occurred. 
 
-Although tskit is designed so that is it rapid to pass through trees sequentially,
+Although tskit is designed so that it is rapid to pass through trees sequentially,
 it is also possible to pull out individual trees from the middle of a tree sequence
 via the {meth}`TreeSequence.at` method. Here's how you can use that to extract
 the tree at location $5\ 000\ 000$ --- the position of the sweep --- and draw it
@@ -173,7 +172,7 @@ print(f"Tree number {swept_tree.index}, which runs from position {intvl.left} to
 swept_tree.draw_svg(size=(1000, 200))
 ```
 :::{margin}
-The {ref}`visualization tutorial <sec_tskit_viz>` gives more drawing possibilities
+The {ref}`visualization tutorial<sec_tskit_viz>` gives more drawing possibilities
 :::
 
 This tree shows the classic signature of a recent expansion or selection event, with many
@@ -227,8 +226,8 @@ sites or mutations in your analyses.
 For many purposes it may be better to focus on the genealogy of your samples, rather than
 the {ref}`sites<sec_data_model_definitions_site>` and
 {ref}`mutations<sec_data_model_definitions_mutation>` that
-{ref}`define <sec_what_is_dna_data>` the genome sequence itself. Nevertheless,
-{program}`tskit` also provides efficient ways to return {class}`Site` object and
+{ref}`define<sec_what_is_dna_data>` the genome sequence itself. Nevertheless,
+{program}`tskit` also provides efficient ways to return {class}`Site` and
 {class}`Mutation` objects from a tree sequence.
 For instance, under the finite sites model of mutation that we used above, multiple mutations
 can occur at some sites, and we can identify them by iterating over the sites using the
@@ -250,13 +249,15 @@ for nmuts, count in enumerate(np.bincount(num_muts)):
 
 (sec_processing_genotypes)=
 
-## Processing genotypes
+## Extracting genotypes
 
 At each site, the sample nodes will have a particular allelic state (or be flagged as
 {ref}`tskit:sec_data_model_missing_data`). The
-{meth}`TreeSequence.variants` method gives access to the
-full variation data. For efficiency, the {attr}`~Variant.genotypes`
-at a site are returned as a [numpy](https://numpy.org) array of integers:
+{meth}`TreeSequence.variants` method gives access to
+full variation data - allelic states and node genotypes at each site.
+Since nodes are by definition haploid, their genotype at a site is simply the
+allelic state, but for efficiency, the {attr}`~Variant.genotypes` attribute
+at a site returns a [numpy](https://numpy.org) array of integers.
 
 ```{code-cell} ipython3
 import numpy as np
@@ -272,30 +273,27 @@ for v in ts.variants():
 
 :::{note}
 Tree sequences are optimised to look at all samples at one site, then all samples at an
-adjacent site, and so on along the genome. It is much less efficient look at all the
+adjacent site, and so on along the genome. It is much less efficient to look at all the
 sites for a single sample, then all the sites for the next sample, etc. In other words,
 you should generally iterate over sites, not samples. Nevertheless, all the alleles for
-a single sample can be obtained via the
-{meth}`TreeSequence.haplotypes` method.
+a single sample can be obtained via the inefficient {meth}`TreeSequence.haplotypes` method.
+If a reference sequence is defined, 
 :::
 
-
 To find the actual allelic states at a site, you can refer to the
 {attr}`~Variant.alleles` provided for each {class}`Variant`:
 the genotype value is an index into this list. Here's one way to print them out; for
 clarity this example also prints out the IDs of both the sample nodes (i.e. the genomes)
-and the diploid {ref}`individuals <sec_nodes_or_individuals>` in which each sample
+and the diploid {ref}`individuals<sec_nodes_or_individuals>` in which each sample
 node resides.
 
-
-
 ````{code-cell} ipython3
 samp_ids = ts.samples()
 print("  ID of diploid individual: ", " ".join([f"{ts.node(s).individual:3}" for s in samp_ids]))
 print("       ID of (sample) node: ", " ".join([f"{s:3}" for s in samp_ids]))
 for v in ts.variants():
     site = v.site
-    alleles = np.array(v.alleles)
+    alleles = np.array(v.alleles)  # NB, a shortcut is to use v.states()
     print(f"Site {site.id} (ancestral state '{site.ancestral_state}')",  alleles[v.genotypes])
     if site.id >= 4:  # only print up to site ID 4
         print("...")
@@ -309,7 +307,6 @@ such as indels, leading to allelic states which need not be one of these 4 lette
 even be a single letter.
 :::
 
-
 (sec_tskit_getting_started_compute_statistics)=
 
 ## Computing statistics
@@ -353,24 +350,24 @@ is the spectrum for a section of the tree sequence between 5 and 5.5Mb, which we
 created by deleting the regions outside that interval using
 {meth}`TreeSequence.keep_intervals`. Unsurprisingly,
 as we noted when looking at the trees, there's a far higher proportion of singletons in
-the region of the sweep.
+the region of the sweep compared to the entire genome.
 
 (sec_tskit_getting_started_compute_statistics_windowing)=
 
 ### Windowing
 
 It is often useful to see how statistics vary in different genomic regions. This is done
-by calculating them in {ref}`tskit:sec_stats_windows` along the genome. For this,
+by calculating them in {ref}`windows<tskit:sec_stats_windows>` along the genome. For this,
 let's look at a single statistic, the genetic {meth}`~TreeSequence.diversity` (π). As a
-site statistic this measures the average number of genetic differences between two
-randomly chosen samples, whereas as a branch length statistic it measures the average
+site statistic, it measures the average number of genetic differences between two
+randomly chosen samples, whereas as a branch statistic, it measures the average
 branch length between them. We'll plot how the value of π changes using 10kb windows,
 plotting the resulting diversity between positions 4 and 6 Mb:
 
 ```{code-cell} ipython3
 fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 3))
 L = int(ts.sequence_length)
-windows = np.linspace(0, L, num=L//10_000)
+windows = np.linspace(0, L, num=L//10_000 + 1)
 ax1.stairs(ts.diversity(windows=windows), windows/1_000, baseline=None)  # Default is mode="site"
 ax1.set_ylabel("Diversity")
 ax1.set_xlabel("Genome position (kb)")
@@ -380,15 +377,15 @@ ax1.set_yscale("log")
 ax1.set_ylim(1e-6, 1e-3)
 ax2.stairs(ts.diversity(windows=windows, mode="branch"), windows/1_000, baseline=None)
 ax2.set_xlabel("Genome position (kb)")
-ax2.set_title("Branch-length-based calculation")
+ax2.set_title("Branch-based calculation")
 ax2.set_xlim(4e3, 6e3)
 ax2.set_yscale("log")
 plt.show()
 ```
 
 There's a clear drop in diversity in the region of the selective sweep. And as expected,
-the statistic based on branch-lengths gives a less noisy signal.
-
+the statistic based on branch lengths gives a less noisy signal than the statistic based
+on randomly occurring mutations at sites.
 
 (sec_tskit_getting_started_exporting_data)=
 
@@ -434,7 +431,7 @@ print(small_ts.as_nexus(precision=3, include_alignments=False))
 
 ### VCF
 
-The standard way of interchanging genetic variation data is the Variant Call Format, 
+The standard way of interchanging genetic variation data is the Variant Call Format (VCF), 
 for which tskit has basic support:
 
 ```{code-cell} ipython3
@@ -442,9 +439,8 @@ import sys
 small_ts.write_vcf(sys.stdout)
 ```
 
-The write_vcf method takes a file object as a parameter; to get it to write out to the
-notebook here we ask it to write to stdout.
-
+The {meth}`TreeSequence.write_vcf` method takes a file object as a parameter;
+to get it to write out to the notebook here we ask it to write to stdout.
 
 (sec_tskit_getting_started_exporting_data_allel)=
 
@@ -466,23 +462,31 @@ print(h.n_variants, h.n_haplotypes)
 h
 ```
 
-Sckit.allel has a wide-ranging and efficient suite of tools for working with genotype
-data, so should provide anything that's needed. For example, it gives us an
-another way to compute the pairwise diversity statistic (that we calculated
-{ref}`above<sec_tskit_getting_started_compute_statistics_windowing>`
-using the native {meth}`TreeSequence.diversity` method):
+{program}`scikit-allel` has a wide-ranging and efficient suite of tools for working
+with genotype data for most users' needs. For example, it gives us an
+another way to compute the pairwise diversity statistic:
 
 ```{code-cell} ipython3
 ac = h.count_alleles()
 allel.mean_pairwise_difference(ac)
 ```
 
+Here's the equivalent using the native {meth}`TreeSequence.diversity` method,
+{ref}`windowed <tskit:sec_stats_windows>` by site:
+
+```{code-cell} ipython3
+# Per site (not span-normalised to match scikit-allel)
+small_ts.diversity(windows="sites", span_normalise=False)
+```
+
+Note, however, that you would normally want to set `span_normalise=True`  to account
+for the fact that sites are not evenly spread along the genome (see {ref}`these docs<tskit:sec_stats_span_normalise>`).
 
 (sec_tskit_getting_started_key_points)=
 
 ## Key points covered above
 
-Some simple methods and take-home messages from this introduction to the
+Here are the simple methods and take-home messages from this introduction to the
 {program}`tskit` {ref}`sec_python_api`,
 in rough order of importance:
 
@@ -496,15 +500,15 @@ in rough order of importance:
     * {ref}`sec_terminology_nodes` (i.e. genomes) can belong to
         {ref}`individuals<sec_terminology_individuals_and_populations>`. For example,
         sampling a diploid individual results in an {class}`Individual` object which
-        possesses two distinct {ref}`sample nodes<sec_terminology_nodes_samples>`.
+        links to two distinct {ref}`sample nodes<sec_terminology_nodes_samples>`.
 * Key tree sequence methods
     * {meth}`~TreeSequence.samples()` returns an array of node IDs specifying the
         nodes that are marked as samples
     * {meth}`~TreeSequence.node` returns the node object for a given integer node ID
     * {meth}`~TreeSequence.trees` iterates over all the trees
     * {meth}`~TreeSequence.sites` iterates over all the sites
-    * {meth}`~TreeSequence.variants` iterates over all the sites with their genotypes
-        and alleles
+    * {meth}`~TreeSequence.variants` iterates over all the sites with their list of
+        unique alleles and genotypes
     * {meth}`~TreeSequence.simplify()` reduces the number of sample nodes in the tree
         sequence to a specified subset
     * {meth}`~TreeSequence.keep_intervals()` (or its complement,
@@ -519,4 +523,3 @@ in rough order of importance:
         sequence, for example {meth}`~TreeSequence.allele_frequency_spectrum`,
         {meth}`~TreeSequence.diversity`, and {meth}`~TreeSequence.Fst`; these can
         also be calculated in windows along the genome.
-
diff --git a/terminology_and_concepts.md b/terminology_and_concepts.md
index 72c13cac..f1742114 100644
--- a/terminology_and_concepts.md
+++ b/terminology_and_concepts.md
@@ -48,8 +48,6 @@ def basics():
     )
     tables.mutations.time = np.full_like(tables.mutations.time, tskit.UNKNOWN_TIME)
     tables.tree_sequence().dump("data/basics.trees")
-    
-
 
 def create_notebook_data():
     basics()
@@ -71,23 +69,24 @@ concepts behind {program}`tskit`, the tree sequence toolkit.
 
 ::::{margin}
 :::{note}
-See {ref}`sec_intro_downloading_datafiles` to run this tutorial on your own computer
+See {ref}`sec_intro_downloading_datafiles` to run this tutorial on your own computer.
 :::
 ::::
 
 A tree sequence is a data structure which describes a set of correlated
 evolutionary trees, together with some associated data that specifies, for example,
-the location of mutations in the tree sequence. More technically, a tree sequence
-stores a biological structure known as an "Ancestral Recombination Graph", or ARG.
+the location of mutations in the genome. More technically, it is a format that can
+be used to store a variety of different population genetic structures that are loosely
+known as ancestral recombination graphs (ARGs).
 
-Below are the most important {ref}`terms and concepts <tskit:sec_data_model_definitions>`
-that you'll encounter in these tutorials,  but first we'll {func}`~tskit.load` a tree
-sequence from a `.trees` file using the
+Below are the most important terms and concepts that you'll encounter in these tutorials.
+A concise glossary of these terms and concepts is available at {ref}`here<tskit:sec_data_model_definitions>`.
+But first we'll {func}`~tskit.load` a tree sequence from a `.trees` file using the
 {ref}`tskit:sec_python_api` (which will be used in the rest of this tutorial):
 
 ```{code-cell} ipython3
 import tskit
-# We will often store the python tree sequence object in a variable named "ts"
+# We will often store the python tree sequence object in a variable named "ts" or "arg"
 ts = tskit.load("data/basics.trees")
 ```
 
@@ -97,7 +96,7 @@ ts = tskit.load("data/basics.trees")
 :::{note}
 {ref}`Workarounds<msprime:sec_ancestry_multiple_chromosomes>` exist
 to represent a multi-chromosome genome as a tree
-sequence, but are not covered here
+sequence, but are not covered here.
 :::
 ::::
 
@@ -126,13 +125,12 @@ ts.draw_svg(
 )
 ```
 
-Each tree records the lines of descent along which a piece of DNA has been
-inherited (ignore for the moment the red symbols, which represent a mutation).
+Each tree records the lines of descent along which a piece of DNA has been inherited.
 For example, the first tree tells us that DNA from ancestral genome 7 duplicated
 to produce two lineages, which ended up in genomes 1 and 4, both of which exist in the
 current population. In fact, since this pattern is seen in all trees, these particular
 lines of inheritance were taken by all the DNA in this 1000 base pair genome.
-
+The red symbol is a mutation, which we will describe later.
 
 (sec_terminology_nodes)=
 
@@ -148,7 +146,6 @@ an *internal node*, representing an ancestor in which a single DNA
 sequence was duplicated (in forward-time terminology) or in which multiple sequences
 coalesced (in backwards-time terminology). 
 
-
 (sec_terminology_nodes_samples)=
 
 #### Sample nodes
@@ -163,7 +160,6 @@ labelled $0..5$, and also 6 non-sample nodes, labelled $6..11$, in the tree sequ
 print("There are", ts.num_nodes, "nodes, of which", ts.num_samples, "are sample nodes")
 ```
 
-
 (sec_terminology_edges)=
 
 ### Edges
@@ -175,10 +171,9 @@ three trees in the example above has a branch from node 7 to node 1, but those t
 branches represent just a single edge.
 
 Each edge is associated with a parent node ID and a child node ID. The time of the parent
-node must be
-strictly greater than the time of the child node, and the difference in these times is
-sometimes referred to as the "length" of the edge. Since trees in a tree sequence are
-usually taken to represent marginal trees along a genome, as well as the time dimension
+node must be strictly greater than the time of the child node, and the difference in these times
+is sometimes referred to as the "length" of the edge. Since trees in a tree sequence are
+usually taken to represent local trees along a genome, as well as the time dimension
 each edge also has a genomic _span_, defined by a *left* and a *right* position
 along the genome. There are 15 edges in the tree sequence above. Here's an example of
 one of them:
@@ -210,7 +205,7 @@ sample nodes, $0..5$ in the drawing, are present in all the trees (since we have
 full genomes), but the other nodes, such as node $9$, need not be: indeed in larger
 tree sequences they are rarely so.
 
-In tree sequence terminology, we don't explictly keep track of where nodes
+In tree sequence terminology, we don't explicitly keep track of where nodes
 start and end. Only edges (not nodes) possess a genomic span. So for example, this tree
 sequence is defined using edges like $(7\rightarrow1)$ which span the entire genome,
 edges like $(11\rightarrow10)$ which only span the leftmost section of the genome, and
@@ -237,12 +232,9 @@ information about (say) node $7$, including the IDs of its parent and child node
 first_tree = ts.first()
 parent_of_7 = first_tree.parent(7)
 children_of_7 = first_tree.children(7)
-print("Node 7's parent is", parent_of_7, "and childen are", children_of_7, "in the first tree")
+print("Node 7's parent is", parent_of_7, "and children are", children_of_7, "in the first tree")
 ```
 
-
-
-
 (sec_terminology_individuals_and_populations)=
 
 ### Individuals and populations
@@ -254,7 +246,7 @@ obtain two copies of each autosomal chromosome. The tree with six sample nodes a
 could therefore represent the result of sampling three diploid individuals from a larger
 population. The tree sequence can keep track of the individuals in which nodes reside,
 and store specific information about them (such as the individuals' spatial location)
-as well as arbitrary {ref}`metadata <sec_metadata>` (such as a name). In this particular
+as well as arbitrary {ref}`metadata<sec_metadata>` (such as a name). In this particular
 tree sequence the sample nodes are indeed associated with three named diploid
 individuals: ``Ada``, ``Bob`` and ``Cat``.
 
@@ -324,7 +316,7 @@ this case, site ID 0 (which happens to be at position 751):
 print(ts.site(0))  # For convenience, the Python API also returns the mutations at the site
 ```
 
-In the plot above, since the the only mutation is above node 8 in the last tree, and has
+In the plot above, since the only mutation is above node 8 in the last tree, and has
 a {attr}`~Mutation.derived_state` of "G", we know that the samples descending from node
 8 in the last tree (sample genomes 2, 3, and 5) have a "G" at {attr}`~Site.position` 751,
 while the others have the {attr}`~Site.ancestral_state` of "T". This means that Ada is
@@ -333,6 +325,12 @@ In other words the ancestral state and the details of any mutations at that site
 when coupled with the tree topology at the site {attr}`~Site.position`, is sufficient to
 define the allelic state possessed by each sample.
 
+{program}`Tskit` can also handle multiple mutations at a site. In this case, it can be
+helpful to know the {attr}`~Mutation.inherited_state` for a mutation (i.e. the state at this
+site before the mutation occurred). Often this might be the {attr}`~Site.ancestral_state`,
+but it need not be if there are sequential mutations along a single path in a tree. In this
+case the mutation will also be associated with the ID of a {attr}`~Mutation.parent` mutation.
+
 Note that even though the genome is 1000 base pairs long, the tree sequence only contains
 a single site, because we usually only bother defining *variable* sites in a tree
 sequence (e.g. positions seen in studies to have samples possessing different alleles at
@@ -340,7 +338,6 @@ that genomic location). It is perfectly possible to have a site with no mutation
 (or silent mutations) --- i.e. a "monomorphic" site --- but such sites are not normally
 used in further analysis.
 
-
 (sec_terminology_provenance)=
 
 ### Provenance
@@ -354,7 +351,6 @@ call to msprime that produced it, and the second the call to
 provenance entries are sufficient to exactly recreate the tree sequence, but this
 is not always possible.
 
-
 (sec_concepts)=
 
 ## Concepts
@@ -375,7 +371,7 @@ ts.draw_svg(
 
 Note that all these trees show strictly bifurcating splits, but this does not need to be
 the case for tree sequences in general. In particular, where tree sequences have been
-created by the multiple-merger coalecent process, or in tree
+created by the multiple-merger coalescent process, or in tree
 sequences that have been inferred from real data, it is possible to have a parent node
 with 3 or more children in a particular tree (these are known as *polytomies*).
 
@@ -385,17 +381,18 @@ with 3 or more children in a particular tree (these are known as *polytomies*).
 ### Tree changes, ancestral recombinations, and SPRs
 
 The process of recombination usually results in trees along a genome where adjacent
-trees differ by only a few "tree edit" or SPR (subtree-prune-and-regraft) operations.
+trees differ by only a few "tree edit" or subtree-prune-and-regraft (SPR) operations.
 The result is a tree sequence in which very few edges
 {ref}`change from tree to tree<fig_what_is_edge_diffs>`.
-This is the underlying reason that `tskit` is so
+This is the underlying reason that {program}`tskit` is so
 efficient, and is well illustrated in the example tree sequence above.
 
 In this (simulated) tree sequence, each tree differs from the next by a single SPR.
-The subtree defined by node 7 in the first tree has been pruned and regrafted onto the
-branch between 0 and 10, to create the second tree. The second and third trees have the
-same topology, but differ because their ultimate coalesence happened in a different
-ancestor (easy to spot in a simulation, but hard to detect in real data). This is also
+The subtree defined by node 7 in the first tree has been pruned (away from node 11) and
+regrafted onto the branch between 0 and 9, to create the second tree.
+The second and third trees have the same topology,
+but differ because their ultimate coalescence happened in a different ancestor
+(easy to spot in a simulation, but hard to detect in real data). This is also
 caused by a single SPR: looking at the second tree, either the subtree below node 8 or
 the subtree below node 9 must have been pruned and regrafted higher up on the same
 lineage to create the third tree. Because this is a fully {ref}`simplified<sec_simplification>`
@@ -404,28 +401,43 @@ know this, we would need to have a tree sequence with the exact history of recom
 recorded (see below).
 
 In general, each detectable recombination occurring in ancestral history results in a
-single SPR in the tree sequence. If recombination breakpoints occurs at unique
+single SPR in the tree sequence. If recombination breakpoints occur at unique
 positions (an "infinite sites" model of breakpoints), then the number of trees in a tree
 sequence equals the number of ancestral recombination events plus one. If recombinations
 can occur at the same physical position (e.g. if the genome is treated as a set of
 discrete integer positions, as in the simulation that created this tree sequence) then
-moving from  one tree to the next in a tree sequence might require multiple SPRs if
+moving from one tree to the next in a tree sequence might require multiple SPRs if
 there are multiple, overlaid ancestral recombination events.
 
+### Tables
+
+The underlying data in a tree sequence are stored in a collection of tables. Details of
+these tables, as well as some basic properties of the tree sequence which they encode,
+are shown when printing a tree sequence to the screen:
+
+```{code-cell} ipython3
+ts  # or use `print(ts)` for the plain text representation
+```
+
+There is a separate API for dealing with a tree sequence as a collection of tables,
+which can be useful for dealing with large datasets, and is needed if you want to
+{ref}`edit<sec_tables_editing>` an existing tree sequence. This is covered in
+the {ref}`sec_tables` tutorial.
+
 (sec_concepts_args)=
 
 ### Tree sequences and ARGs
 
 ::::{margin}
 :::{note}
-There is a subtle distinction between common ancestry and coalescence. In particular, all coalescent nodes are common ancestor events, but not all common ancestor events in an ARG result in coalescence in a local tree.
+There is a subtle distinction between common ancestry and coalescence. In particular, all coalescent nodes are common ancestor events, but not all common ancestor events in an ARG result in coalescence in all local trees.
 :::
 ::::
 
-The term "Ancestral Recombination Graph", or ARG, is commonly used to describe a genetic
+The term Ancestral Recombination Graph (ARG), is commonly used to describe a genetic
 genealogy. In particular, many (but not all) authors use it to mean a genetic
 genealogy in which details of the position and potentially the timing of all
-recombination and common ancestor events are explictly stored. For clarity
+recombination and common ancestor events are explicitly stored. For clarity
 we refer to this sort of genetic genealogy as a "full ARG". Succinct tree sequences can
 represent many different sorts of ARGs, including "full ARGs", by incorporating extra
 non-coalescent nodes (see the {ref}`sec_args` tutorial). However, tree sequences are
@@ -438,11 +450,11 @@ which omits these extra nodes. This is for two main reasons:
 2. The number of recombination and non-coalescing common ancestor events in the genealogy
    quickly grows to dominate the total number of nodes in the tree sequence,
    without actually contributing to the mutations inherited by the samples.
-   In other words, these nodes are redundant to the storing of genome data.
+   In other words, these nodes are redundant to the storing of genomic data.
 
 Therefore, compared to a full ARG, you can think of a simplified tree sequence as
 storing the trees *created by* recombination events, rather than attempting to record the
-recombination events themselves. The actual recombination events can be sometimes be
+recombination events themselves. The actual recombination events can sometimes be
 inferred from these trees but, as we have seen, it's not always possible. Here's another
 way to put it:
 
@@ -450,20 +462,7 @@ way to put it:
 > whereas a [simplified] tree sequence encodes the outcome of those events"
 > ([Kelleher _et al._, 2019](https://doi.org/10.1534/genetics.120.303253))
 
-
-### Tables
-
-The underlying data in a tree sequence are stored in a collection of tables. Details of
-these tables, as well as some basic properties of the tree sequence which they encode,
-are shown when printing a tree sequence to the screen:
-
-```{code-cell} ipython3
-ts  # or use `print(ts)` for the plain text representation
-```
-
-There is a separate API for dealing with a tree sequence as a collection of tables,
-which can be useful for dealing with large datasets, and is needed if you want to
-{ref}`edit<sec_tables_editing>` an existing tree sequence. This is covered in
-the {ref}`sec_tables` tutorial.
+[Wong _et al._, 2024](https://doi.org/10.1093/genetics/iyae100)
+review this topic in detail.
 
 
diff --git a/viz.md b/viz.md
index 2c806b92..6de6a5aa 100644
--- a/viz.md
+++ b/viz.md
@@ -2228,7 +2228,7 @@ def size_max(graph):
 # See https://popsim-consortium.github.io/demes-docs/ for the yml spec for the file below
 graph = demes.load("data/whatis_example.yml")
 w = 1.5 * size_max(graph)
-positions = dict(Ancestral_population=0, A=-w, B=w)
+positions = dict(Ancestral_population=0, P=-w, Q=w)
 ax = demesdraw.tubes(graph, positions=positions, seed=1)
 plt.show(ax.figure)
 ```
diff --git a/what_is.md b/what_is.md
index faf89a6c..31747f34 100644
--- a/what_is.md
+++ b/what_is.md
@@ -13,8 +13,10 @@ kernelspec:
 
 ```{code-cell} ipython3
 :tags: [remove-cell]
-import msprime
 import demes
+import msprime
+import numpy as np
+import tskit
 
 
 def whatis_example():
@@ -29,9 +31,9 @@ def whatis_example():
           - name: Ancestral_population
             epochs:
               - end_time: 1000
-          - name: A
+          - name: P
             ancestors: [Ancestral_population]
-          - name: B
+          - name: Q
             ancestors: [Ancestral_population]
             epochs:
               - start_size: 2000
@@ -39,8 +41,8 @@ def whatis_example():
               - start_size: 400
                 end_size: 10000
         migrations:
-          - source: A
-            dest: B
+          - source: P
+            dest: Q
             rate: 1e-4
         """
     with open("data/whatis_example.yml", "wt") as f:
@@ -49,18 +51,51 @@ def whatis_example():
     demography = msprime.Demography.from_demes(graph)
     # Choose seed so num_trees=3, tips are in same order,
     # first 2 trees are topologically different, and all trees have the same root
-    seed = 12581
-    ts = msprime.sim_ancestry(
-        samples={"A": 2, "B": 3},
-        demography=demography,
-        recombination_rate=1e-8,
-        sequence_length=1000,
-        random_seed=seed)
+    for seed in range(21964, 10000000):
+        ts = msprime.sim_ancestry(
+            samples={"P": 2, "Q": 3},
+            demography=demography,
+            recombination_rate=1e-8,
+            sequence_length=900,
+            additional_nodes=(
+                 msprime.NodeType.RECOMBINANT |
+                 msprime.NodeType.COMMON_ANCESTOR
+                 ),
+            coalescing_segments_only=False,
+            random_seed=seed)
+        if ts.num_trees != 3:
+            continue
+        if not len(set(tree.root for tree in ts.trees())) == 1:
+            continue
+        sts = ts.simplify()
+        if sts.num_trees != ts.num_trees:
+            continue
+        if sts.at_index(0).rank() == sts.at_index(1).rank():
+            continue
+        if sts.at_index(1).rank() == sts.at_index(2).rank():
+            continue
+        samps = [list(u for u in tree.nodes(order="minlex_postorder") if tree.is_sample(u)) for tree in sts.trees()]
+        if samps[0] != samps[1] or samps[0] != samps[2]:
+            continue
+        print(f"Used demography seed {seed}")
+        break
     # Mutate
-    # Choose seed to give 12 muts, last one above node 14
-    seed = 1476
-    ts = msprime.sim_mutations(ts, rate=1e-7, random_seed=seed)
+    # Choose seed to give 12 muts, last one above node 15
+    for seed in range(273, 10000):
+        mts = msprime.sim_mutations(ts, rate=2e-7, random_seed=seed)
+        if mts.num_mutations != 12:
+            continue
+        if mts.mutation(-1).node != 15:
+            continue
+        print(f"Used mutations seed {seed}")
+        break
+    # put the mutations along midpoints of branches
+    tables = mts.dump_tables()
+    tables.mutations.time = np.full_like(tables.mutations.time, tskit.UNKNOWN_TIME)
+    ts = tables.tree_sequence()
     ts.dump("data/whatis_example.trees")
+    return ts
+
 
 def create_notebook_data():
     whatis_example()
@@ -77,9 +112,11 @@ relationships between a set of DNA sequences. Tree sequences are based on fundam
 biological principles of inheritance, DNA duplication, mutation, and recombination;
 they can be created by [evolutionary simulation](https://tskit.dev/software/#simulate)
 or by [inferring genealogies from empirical DNA data](https://tskit.dev/software/#infer).
+Technically, this means they
+{ref}`can be used to represent "ancestral recombination graphs"<sec_concepts_args>`, or ARGs.
 
 :::{margin} Key point
-Tree sequences are used to encode and analyse large genetic datasets
+Tree sequences are used to encode and analyse large genetic datasets.
 :::
 
 Tree sequences provide an efficient way of storing
@@ -140,10 +177,15 @@ plt.show()
 ### A sequence of trees...
 
 As the name suggests, the simplest way to think about a tree sequence is that it
-describes a sequence of correlated "local trees" --- i.e. genetic trees located at
+describes a sequence of correlated "local trees" --- i.e. evolutionary trees located at
 different points along a [chromosome](https://en.wikipedia.org/wiki/Chromosome).
-Here's a tiny example based on ten haploid genomes, $\mathrm{a}$ to $\mathrm{j}$,
-spanning a short 1000 letter chromosome.
+
+Below is a tiny example based on 10 haploid copies of a short chromosome spanning only
+900 basepairs (letters) in length. Each chromosome copy, which we can think of as a
+small genome, is shown as a square symbol. These are labelled $\mathrm{a}$ to $\mathrm{j}$
+and represent "sampled" lengths of DNA, whose ancestry and genetic
+variation is stored in the tree sequence.  The tickmarks on the X axis and their
+associated background shading indicate the genomic positions covered by the local trees.
 
 ```{code-cell} ipython3
 :"tags": ["hide-input"]
@@ -156,14 +198,23 @@ ts = mutated_ts.delete_sites(list(range(mutated_ts.num_sites)))
 labels = {i: string.ascii_lowercase[i] for i in range(ts.num_nodes)}
 genome_order = [n for n in ts.first().nodes(order="minlex_postorder") if ts.node(n).is_sample()]
 labels.update({n: labels[i] for i, n in enumerate(genome_order)})
+rev_lab = {v: k for k, v in labels.items()}
 style1 = (
     ".node:not(.sample) > .sym, .node:not(.sample) > .lab {visibility: hidden;}"
-    ".mut {font-size: 12px} .y-axis .tick .lab {font-size: 85%}")
+    ".mut {font-size: 12px} .y-axis .tick .lab {font-size: 85%}"
+)
+highlt = ".t0 .n%(u)i .lab, .t1 .n%(u)i .lab {font-weight: bold; fill: red}" % {"u": rev_lab["e"]}
+
 sz = (800, 250)  # size of the plot, slightly larger than the default
-ticks = [0, 5000, 10000, 15000, 20000]
+ticks = [0, 5000, 10000, 15000]
 ts.draw_svg(
-    size=sz, node_labels=labels, style=style1, y_label="Time ago",
-    y_axis=True, y_ticks=ticks)
+    size=sz,
+    node_labels=labels,
+    style=style1 + highlt,
+    y_label="Time ago",
+    y_axis=True,
+    y_ticks=ticks,
+)
 ```
 
 ::::{margin}
@@ -173,30 +224,52 @@ the nodes are referred to by {ref}`numerical ID<sec_terminology_nodes>`.
 :::
 ::::
 
-The tickmarks on the X axis and background shading indicate the genomic positions covered
-by the trees. The tickmarks indicate recombination events that explain relationships
-between the ten genomes. There were two such recombination events, giving us three local trees.
-For the first short portion of the chromosome, from the start until position 189,
+
+For the first short portion of the chromosome, from the start until position 367,
 the relationships between the ten genomes are shown by the first tree.
-The second tree shows the relationships between positions 189 and 546.
-By inspecting the first and the second local tree we can see that genomes $\mathrm{b}-\mathrm{f}$
-changed their "most recent common ancestor" (MRCA) with genome $\mathrm{a}$ to
-MRCA with genome $\mathrm{g}$.
-The third tree shows the relationships between positions 546 and 1000 (the end).
-By inspecting the second and the third local tree we can see that
-recombination changed the ancestry of genomes $\mathrm{b}-\mathrm{f}$
-back to shared MRCA with genome $\mathrm{g}$.
+The second tree shows the relationships between positions 367 and 600.
+Note that in the first tree, genome $\mathrm{e}$ (highlighted in red) is closest to
+$\mathrm{a}-\mathrm{d}$, whereas in the second tree $\mathrm{e}$ is closest to
+$\mathrm{f}-\mathrm{h}$. The third tree shows the relationships between positions
+600 and 900 (the end of the genome). In this case, an entire subtree (or "clade"),
+composed of nodes $\mathrm{e}-\mathrm{h}$ has changed its relationship with the other six 
+genomes. More specifically, the most recent common ancestor (MRCA) with any of these
+others has switched: in the third tree it is now above $\mathrm{i}$ and $\mathrm{j}$.
 
 (sec_what_is_genealogical_network)=
 
 ### ... created by a genealogical network
 
-In fact, succinct tree sequences don't store each tree separately, but instead are
-based on an interconnected *genetic genealogy*, in which
-[genetic recombination](https://en.wikipedia.org/wiki/Genetic_recombination) has led
+In fact, succinct tree sequences don't store each tree separately, but are instead
+based on a network of *genetic ancestry*, in which
+[genetic recombination](https://en.wikipedia.org/wiki/Genetic_recombination)
+(represented by red symbols below) has led
 to different regions of the chromosome having different histories. Another way of
-thinking about the tree sequence above is that it describes the full genetic ancestry
-of our 10 genomes.
+thinking about the tree sequence above is that it describes the full genetic genealogy
+of our 10 genomes. If we combine all the relationships encoded in the trees (you can
+loosely think of this as lying the trees on top of each other), the result is a network or
+_graph_ (hence the term "[ARG](https://doi.org/10.1371/journal.pgen.1011110)")
+
+```{code-cell} ipython3
+:"tags": ["hide-input"]
+import json
+import tskit_arg_visualizer as argviz
+d3arg = argviz.D3ARG.from_ts(ts=ts)
+with open("data/whatis_example.json") as f:
+    d3arg.set_node_x_positions(
+        argviz.extract_x_positions_from_json(json.load(f))
+    )
+d3arg.nodes.loc[:, 'label'] = ""
+d3arg.nodes.loc[d3arg.nodes.ts_flags & msprime.NODE_IS_RE_EVENT != 0, 'fill'] = "red"
+d3arg.set_node_labels({u: labels[u] for u in ts.samples()})
+d3arg.set_all_node_styles(size=50, stroke_width=3)
+d3arg.set_node_styles({u: {"symbol": "d3.symbolSquare", "fill": "black"} for u in ts.samples()})
+d3arg.draw(
+    title="An interactive graph representation\n(hover over the bottom bar to see the trees)",
+    edge_type="ortho",
+    height=220);
+```
+
 
 (sec_what_is_dna_data)=
 
@@ -226,12 +299,12 @@ branches of the trees, and the positions of the twelve variable sites associated
 mutations are shown along the X axis.
 
 :::{margin} Key point
-Mutation on trees are the source of genetic variation
+Mutation on trees are the source of genetic variation.
 :::
 
-The trees inform us that, for example, the final mutation (at position 987) is inherited
-by genomes $\mathrm{h}$ to $\mathrm{j}$. These genomes must have an *T* at that position,
-compared to the original value of *G*. In other words, once we know the ancestry, placing
+The trees inform us that, for example, the final mutation (at position 851) is inherited
+by genomes $\mathrm{i}$ and $\mathrm{j}$. These genomes must have an *C* at that position,
+compared to the original value of *A*. In other words, once we know the ancestry, placing
 a relatively small number of mutations is enough to explain all the observed genetic
 variation. Here's the resulting "variant matrix":
 
@@ -266,7 +339,7 @@ ts.draw_svg(
 ```
 
 :::{margin} Key point
-Tree sequences are efficient because they don't store each tree separately
+Tree sequences are efficient because they don't store each tree separately.
 :::
 
 A branch can be shared by many adjacent trees, but is stored as a single edge in the tree
@@ -312,14 +385,6 @@ plt.show()
 
 ## A record of genetic ancestry
 
-::::{margin}
-:::{note}
-The genetic genealogy is sometimes referred to as an ancestral recombination graph,
-or ARG, and one way to think of tskit tree sequence is as a way
-to store various different sorts of ARGs (see the {ref}`ARG tutorial<sec_args>`)
-:::
-::::
-
 Often, we're not interested so much in the DNA sequence data as the genetic ancestry
 itself (discussed e.g. [here](https://www.nature.com/articles/s41588-019-0492-x)).
 In other words, the main consideration is the actual trees in a tree sequence, rather
@@ -333,8 +398,8 @@ The tree sequence in this tutorial was actually generated using a model of popul
 splits and expansions as shown in the following schematic,
 {ref}`plotted<sec_tskit_viz_other_demographic>` using the
 [DemesDraw](https://pypi.org/project/demesdraw/) package. Our 10 genomes were sampled
-from modern day populations A (a constant-size population) and B (a recently expanding
-one), where limited migration is occuring from A to B.
+from modern day populations P (a constant-size population) and Q (a recently expanding
+one), where limited migration is occuring from P to Q.
 
 ```{code-cell} ipython3
 :"tags": ["remove-input"]
@@ -352,7 +417,7 @@ def size_max(graph):
 
 graph = demes.load("data/whatis_example.yml")
 w = 1.5 * size_max(graph)
-positions = dict(Ancestral_population=0, A=-w, B=w)
+positions = dict(Ancestral_population=0, P=-w, Q=w)
 fig, ax = plt.subplots(1, figsize=(5, 3))
 ax = demesdraw.tubes(graph, ax=ax, positions=positions, seed=1)
 plt.show(ax.figure)
@@ -376,7 +441,7 @@ style2 = ".y-axis .tick .lab {font-size: 85%}"
 style2 += "#svg2 .node > .sym {visibility: visible;}"  # force-show all nodes: not normally needed
 style2 += "".join([f".p{n.population} > .sym {{fill: {colours[n.population]}}}" for n in ts.nodes()])
 
-mutated_ts.draw_svg(
+mutated_ts.simplify().draw_svg(
     size=sz, root_svg_attributes={'id':'svg2'}, y_label="Time ago (generations)",
     y_axis=True, y_ticks=ticks, node_labels=labels, mutation_labels={}, style=style2)
 ```
@@ -389,7 +454,7 @@ deduce these MRCA genomes, simply by looking at which mutations they have inheri
 ```{code-cell} ipython3
 :"tags": ["hide-input"]
 import numpy as np
-tables = mutated_ts.dump_tables()
+tables = mutated_ts.simplify().dump_tables()
 # Flip sample and nonsample flags, making the haplotypes() method print out nonsample nodes
 s_flags = tables.nodes.flags[ts.samples()[0]]
 no_flags = s_flags-s_flags
@@ -428,7 +493,7 @@ multiple correlated trees along a genome.
 
 ```{margin} Key point
 Most genetic calculations involve iterating over trees, which is highly efficient in
-{program}`tskit`
+{program}`tskit`.
 ```
 
 For example, statistical measures of genetic variation can be thought of as a calculation
@@ -466,7 +531,7 @@ The {program}`tskit` library has {ref}`extensive support<sec_analysing_tree_sequ
 for these sorts of population genetic calculations. It provides efficient methods for
 traversing through large {ref}`trees<sec_analysing_trees_traversals>` and
 {ref}`tree sequences<sec_processing_trees>`, as well as providing other
-phylogenetically relevant methods such as
+relevant methods such as principal component analysis, 
 {ref}`parsimonious placement of mutations<sec_analysing_trees_parsimony>`,
 and the {ref}`counting of topologies<sec_counting_topologies>` embedded within
 larger trees.