And if anyone wants to check my work or just play along at home, what I did was install the Levenshtein python package (‘pip3 install python-Levenshtein’) and run the following code:
#!/usr/local/bin/python3
import Levenshtein as lev
import matplotlib.pyplot as plt
import random
def main() :
gc_content = .41
niter = 1000
k = 100
ngc = nat = nmixed = ntransition = 0
for it in range(niter) :
# pick two strings, each k long, of a, c, g, t, with frequencies determined by gc_content
al = random.choices(['a', 'c', 'g', 't'], weights=[0.5*(1-gc_content), 0.5*gc_content,
0.5*gc_content, 0.5*(1-gc_content)], k=k)
bl = random.choices(['a', 'c', 'g', 't'], weights=[0.5*(1-gc_content), 0.5*gc_content,
0.5*gc_content, 0.5*(1-gc_content)], k=k)
a = ''.join(al)
b = ''.join(bl)
# lev.opcodes does all the work -- it provides a list of edits to turn a into b
for tag, i1, i2, j1, j2 in lev.opcodes(a, b) :
if tag == 'replace' : # substitutions only
if i2 - i1 > 1 or j2 - j1 > 1 : continue # single character only
if a[i1] == 'c' and b[j1] == 'g' : ngc += 1
elif a[i1] == 'g' and b[j1] == 'c' : ngc += 1
elif a[i1] == 'a' and b[j1] == 't' : nat += 1
elif a[i1] == 't' and b[j1] == 'a' : nat += 1
elif a[i1] == 'a' and b[j1] == 'c' : nmixed += 1
elif a[i1] == 'c' and b[j1] == 'a' : nmixed += 1
elif a[i1] == 'g' and b[j1] == 't' : nmixed += 1
elif a[i1] == 't' and b[j1] == 'g' : nmixed += 1
elif a[i1] == 'c' and b[j1] == 't' : ntransition += 1
elif a[i1] == 't' and b[j1] == 'c' : ntransition += 1
elif a[i1] == 'a' and b[j1] == 'g' : ntransition += 1
elif a[i1] == 'g' and b[j1] == 'a' : ntransition += 1
else : print('huh?', a[i1], b[j1])
print(ngc, nat, nmixed, ntransition)
fig, ax = plt.subplots()
types = ['Transition', 'G<->C', 'A<->T', 'A<->C/G<->T']
ntot = niter * k
counts = [ntransition, ngc, nat, nmixed]
rates = [ntransition/ntot, ngc/ntot/gc_content, nat/ntot/(1-gc_content), nmixed/ntot]
ax.bar(types, rates)
ax.set_title('Levenshtein replacements, random sequence (per available base)')
# These give the uncorrected values:
# ax.bar(types, counts)
# ax.set_title('Levenshtein replacements, random sequence (uncorrected)')
fig.savefig('lev.pdf', bbox_inches='tight')
main()