This file is indexed.

/usr/share/doc/mira-assembler/DefinitiveGuideToMIRA.html is in mira-doc 4.9.5-5.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
  227
  228
  229
  230
  231
  232
  233
  234
  235
  236
  237
  238
  239
  240
  241
  242
  243
  244
  245
  246
  247
  248
  249
  250
  251
  252
  253
  254
  255
  256
  257
  258
  259
  260
  261
  262
  263
  264
  265
  266
  267
  268
  269
  270
  271
  272
  273
  274
  275
  276
  277
  278
  279
  280
  281
  282
  283
  284
  285
  286
  287
  288
  289
  290
  291
  292
  293
  294
  295
  296
  297
  298
  299
  300
  301
  302
  303
  304
  305
  306
  307
  308
  309
  310
  311
  312
  313
  314
  315
  316
  317
  318
  319
  320
  321
  322
  323
  324
  325
  326
  327
  328
  329
  330
  331
  332
  333
  334
  335
  336
  337
  338
  339
  340
  341
  342
  343
  344
  345
  346
  347
  348
  349
  350
  351
  352
  353
  354
  355
  356
  357
  358
  359
  360
  361
  362
  363
  364
  365
  366
  367
  368
  369
  370
  371
  372
  373
  374
  375
  376
  377
  378
  379
  380
  381
  382
  383
  384
  385
  386
  387
  388
  389
  390
  391
  392
  393
  394
  395
  396
  397
  398
  399
  400
  401
  402
  403
  404
  405
  406
  407
  408
  409
  410
  411
  412
  413
  414
  415
  416
  417
  418
  419
  420
  421
  422
  423
  424
  425
  426
  427
  428
  429
  430
  431
  432
  433
  434
  435
  436
  437
  438
  439
  440
  441
  442
  443
  444
  445
  446
  447
  448
  449
  450
  451
  452
  453
  454
  455
  456
  457
  458
  459
  460
  461
  462
  463
  464
  465
  466
  467
  468
  469
  470
  471
  472
  473
  474
  475
  476
  477
  478
  479
  480
  481
  482
  483
  484
  485
  486
  487
  488
  489
  490
  491
  492
  493
  494
  495
  496
  497
  498
  499
  500
  501
  502
  503
  504
  505
  506
  507
  508
  509
  510
  511
  512
  513
  514
  515
  516
  517
  518
  519
  520
  521
  522
  523
  524
  525
  526
  527
  528
  529
  530
  531
  532
  533
  534
  535
  536
  537
  538
  539
  540
  541
  542
  543
  544
  545
  546
  547
  548
  549
  550
  551
  552
  553
  554
  555
  556
  557
  558
  559
  560
  561
  562
  563
  564
  565
  566
  567
  568
  569
  570
  571
  572
  573
  574
  575
  576
  577
  578
  579
  580
  581
  582
  583
  584
  585
  586
  587
  588
  589
  590
  591
  592
  593
  594
  595
  596
  597
  598
  599
  600
  601
  602
  603
  604
  605
  606
  607
  608
  609
  610
  611
  612
  613
  614
  615
  616
  617
  618
  619
  620
  621
  622
  623
  624
  625
  626
  627
  628
  629
  630
  631
  632
  633
  634
  635
  636
  637
  638
  639
  640
  641
  642
  643
  644
  645
  646
  647
  648
  649
  650
  651
  652
  653
  654
  655
  656
  657
  658
  659
  660
  661
  662
  663
  664
  665
  666
  667
  668
  669
  670
  671
  672
  673
  674
  675
  676
  677
  678
  679
  680
  681
  682
  683
  684
  685
  686
  687
  688
  689
  690
  691
  692
  693
  694
  695
  696
  697
  698
  699
  700
  701
  702
  703
  704
  705
  706
  707
  708
  709
  710
  711
  712
  713
  714
  715
  716
  717
  718
  719
  720
  721
  722
  723
  724
  725
  726
  727
  728
  729
  730
  731
  732
  733
  734
  735
  736
  737
  738
  739
  740
  741
  742
  743
  744
  745
  746
  747
  748
  749
  750
  751
  752
  753
  754
  755
  756
  757
  758
  759
  760
  761
  762
  763
  764
  765
  766
  767
  768
  769
  770
  771
  772
  773
  774
  775
  776
  777
  778
  779
  780
  781
  782
  783
  784
  785
  786
  787
  788
  789
  790
  791
  792
  793
  794
  795
  796
  797
  798
  799
  800
  801
  802
  803
  804
  805
  806
  807
  808
  809
  810
  811
  812
  813
  814
  815
  816
  817
  818
  819
  820
  821
  822
  823
  824
  825
  826
  827
  828
  829
  830
  831
  832
  833
  834
  835
  836
  837
  838
  839
  840
  841
  842
  843
  844
  845
  846
  847
  848
  849
  850
  851
  852
  853
  854
  855
  856
  857
  858
  859
  860
  861
  862
  863
  864
  865
  866
  867
  868
  869
  870
  871
  872
  873
  874
  875
  876
  877
  878
  879
  880
  881
  882
  883
  884
  885
  886
  887
  888
  889
  890
  891
  892
  893
  894
  895
  896
  897
  898
  899
  900
  901
  902
  903
  904
  905
  906
  907
  908
  909
  910
  911
  912
  913
  914
  915
  916
  917
  918
  919
  920
  921
  922
  923
  924
  925
  926
  927
  928
  929
  930
  931
  932
  933
  934
  935
  936
  937
  938
  939
  940
  941
  942
  943
  944
  945
  946
  947
  948
  949
  950
  951
  952
  953
  954
  955
  956
  957
  958
  959
  960
  961
  962
  963
  964
  965
  966
  967
  968
  969
  970
  971
  972
  973
  974
  975
  976
  977
  978
  979
  980
  981
  982
  983
  984
  985
  986
  987
  988
  989
  990
  991
  992
  993
  994
  995
  996
  997
  998
  999
 1000
 1001
 1002
 1003
 1004
 1005
 1006
 1007
 1008
 1009
 1010
 1011
 1012
 1013
 1014
 1015
 1016
 1017
 1018
 1019
 1020
 1021
 1022
 1023
 1024
 1025
 1026
 1027
 1028
 1029
 1030
 1031
 1032
 1033
 1034
 1035
 1036
 1037
 1038
 1039
 1040
 1041
 1042
 1043
 1044
 1045
 1046
 1047
 1048
 1049
 1050
 1051
 1052
 1053
 1054
 1055
 1056
 1057
 1058
 1059
 1060
 1061
 1062
 1063
 1064
 1065
 1066
 1067
 1068
 1069
 1070
 1071
 1072
 1073
 1074
 1075
 1076
 1077
 1078
 1079
 1080
 1081
 1082
 1083
 1084
 1085
 1086
 1087
 1088
 1089
 1090
 1091
 1092
 1093
 1094
 1095
 1096
 1097
 1098
 1099
 1100
 1101
 1102
 1103
 1104
 1105
 1106
 1107
 1108
 1109
 1110
 1111
 1112
 1113
 1114
 1115
 1116
 1117
 1118
 1119
 1120
 1121
 1122
 1123
 1124
 1125
 1126
 1127
 1128
 1129
 1130
 1131
 1132
 1133
 1134
 1135
 1136
 1137
 1138
 1139
 1140
 1141
 1142
 1143
 1144
 1145
 1146
 1147
 1148
 1149
 1150
 1151
 1152
 1153
 1154
 1155
 1156
 1157
 1158
 1159
 1160
 1161
 1162
 1163
 1164
 1165
 1166
 1167
 1168
 1169
 1170
 1171
 1172
 1173
 1174
 1175
 1176
 1177
 1178
 1179
 1180
 1181
 1182
 1183
 1184
 1185
 1186
 1187
 1188
 1189
 1190
 1191
 1192
 1193
 1194
 1195
 1196
 1197
 1198
 1199
 1200
 1201
 1202
 1203
 1204
 1205
 1206
 1207
 1208
 1209
 1210
 1211
 1212
 1213
 1214
 1215
 1216
 1217
 1218
 1219
 1220
 1221
 1222
 1223
 1224
 1225
 1226
 1227
 1228
 1229
 1230
 1231
 1232
 1233
 1234
 1235
 1236
 1237
 1238
 1239
 1240
 1241
 1242
 1243
 1244
 1245
 1246
 1247
 1248
 1249
 1250
 1251
 1252
 1253
 1254
 1255
 1256
 1257
 1258
 1259
 1260
 1261
 1262
 1263
 1264
 1265
 1266
 1267
 1268
 1269
 1270
 1271
 1272
 1273
 1274
 1275
 1276
 1277
 1278
 1279
 1280
 1281
 1282
 1283
 1284
 1285
 1286
 1287
 1288
 1289
 1290
 1291
 1292
 1293
 1294
 1295
 1296
 1297
 1298
 1299
 1300
 1301
 1302
 1303
 1304
 1305
 1306
 1307
 1308
 1309
 1310
 1311
 1312
 1313
 1314
 1315
 1316
 1317
 1318
 1319
 1320
 1321
 1322
 1323
 1324
 1325
 1326
 1327
 1328
 1329
 1330
 1331
 1332
 1333
 1334
 1335
 1336
 1337
 1338
 1339
 1340
 1341
 1342
 1343
 1344
 1345
 1346
 1347
 1348
 1349
 1350
 1351
 1352
 1353
 1354
 1355
 1356
 1357
 1358
 1359
 1360
 1361
 1362
 1363
 1364
 1365
 1366
 1367
 1368
 1369
 1370
 1371
 1372
 1373
 1374
 1375
 1376
 1377
 1378
 1379
 1380
 1381
 1382
 1383
 1384
 1385
 1386
 1387
 1388
 1389
 1390
 1391
 1392
 1393
 1394
 1395
 1396
 1397
 1398
 1399
 1400
 1401
 1402
 1403
 1404
 1405
 1406
 1407
 1408
 1409
 1410
 1411
 1412
 1413
 1414
 1415
 1416
 1417
 1418
 1419
 1420
 1421
 1422
 1423
 1424
 1425
 1426
 1427
 1428
 1429
 1430
 1431
 1432
 1433
 1434
 1435
 1436
 1437
 1438
 1439
 1440
 1441
 1442
 1443
 1444
 1445
 1446
 1447
 1448
 1449
 1450
 1451
 1452
 1453
 1454
 1455
 1456
 1457
 1458
 1459
 1460
 1461
 1462
 1463
 1464
 1465
 1466
 1467
 1468
 1469
 1470
 1471
 1472
 1473
 1474
 1475
 1476
 1477
 1478
 1479
 1480
 1481
 1482
 1483
 1484
 1485
 1486
 1487
 1488
 1489
 1490
 1491
 1492
 1493
 1494
 1495
 1496
 1497
 1498
 1499
 1500
 1501
 1502
 1503
 1504
 1505
 1506
 1507
 1508
 1509
 1510
 1511
 1512
 1513
 1514
 1515
 1516
 1517
 1518
 1519
 1520
 1521
 1522
 1523
 1524
 1525
 1526
 1527
 1528
 1529
 1530
 1531
 1532
 1533
 1534
 1535
 1536
 1537
 1538
 1539
 1540
 1541
 1542
 1543
 1544
 1545
 1546
 1547
 1548
 1549
 1550
 1551
 1552
 1553
 1554
 1555
 1556
 1557
 1558
 1559
 1560
 1561
 1562
 1563
 1564
 1565
 1566
 1567
 1568
 1569
 1570
 1571
 1572
 1573
 1574
 1575
 1576
 1577
 1578
 1579
 1580
 1581
 1582
 1583
 1584
 1585
 1586
 1587
 1588
 1589
 1590
 1591
 1592
 1593
 1594
 1595
 1596
 1597
 1598
 1599
 1600
 1601
 1602
 1603
 1604
 1605
 1606
 1607
 1608
 1609
 1610
 1611
 1612
 1613
 1614
 1615
 1616
 1617
 1618
 1619
 1620
 1621
 1622
 1623
 1624
 1625
 1626
 1627
 1628
 1629
 1630
 1631
 1632
 1633
 1634
 1635
 1636
 1637
 1638
 1639
 1640
 1641
 1642
 1643
 1644
 1645
 1646
 1647
 1648
 1649
 1650
 1651
 1652
 1653
 1654
 1655
 1656
 1657
 1658
 1659
 1660
 1661
 1662
 1663
 1664
 1665
 1666
 1667
 1668
 1669
 1670
 1671
 1672
 1673
 1674
 1675
 1676
 1677
 1678
 1679
 1680
 1681
 1682
 1683
 1684
 1685
 1686
 1687
 1688
 1689
 1690
 1691
 1692
 1693
 1694
 1695
 1696
 1697
 1698
 1699
 1700
 1701
 1702
 1703
 1704
 1705
 1706
 1707
 1708
 1709
 1710
 1711
 1712
 1713
 1714
 1715
 1716
 1717
 1718
 1719
 1720
 1721
 1722
 1723
 1724
 1725
 1726
 1727
 1728
 1729
 1730
 1731
 1732
 1733
 1734
 1735
 1736
 1737
 1738
 1739
 1740
 1741
 1742
 1743
 1744
 1745
 1746
 1747
 1748
 1749
 1750
 1751
 1752
 1753
 1754
 1755
 1756
 1757
 1758
 1759
 1760
 1761
 1762
 1763
 1764
 1765
 1766
 1767
 1768
 1769
 1770
 1771
 1772
 1773
 1774
 1775
 1776
 1777
 1778
 1779
 1780
 1781
 1782
 1783
 1784
 1785
 1786
 1787
 1788
 1789
 1790
 1791
 1792
 1793
 1794
 1795
 1796
 1797
 1798
 1799
 1800
 1801
 1802
 1803
 1804
 1805
 1806
 1807
 1808
 1809
 1810
 1811
 1812
 1813
 1814
 1815
 1816
 1817
 1818
 1819
 1820
 1821
 1822
 1823
 1824
 1825
 1826
 1827
 1828
 1829
 1830
 1831
 1832
 1833
 1834
 1835
 1836
 1837
 1838
 1839
 1840
 1841
 1842
 1843
 1844
 1845
 1846
 1847
 1848
 1849
 1850
 1851
 1852
 1853
 1854
 1855
 1856
 1857
 1858
 1859
 1860
 1861
 1862
 1863
 1864
 1865
 1866
 1867
 1868
 1869
 1870
 1871
 1872
 1873
 1874
 1875
 1876
 1877
 1878
 1879
 1880
 1881
 1882
 1883
 1884
 1885
 1886
 1887
 1888
 1889
 1890
 1891
 1892
 1893
 1894
 1895
 1896
 1897
 1898
 1899
 1900
 1901
 1902
 1903
 1904
 1905
 1906
 1907
 1908
 1909
 1910
 1911
 1912
 1913
 1914
 1915
 1916
 1917
 1918
 1919
 1920
 1921
 1922
 1923
 1924
 1925
 1926
 1927
 1928
 1929
 1930
 1931
 1932
 1933
 1934
 1935
 1936
 1937
 1938
 1939
 1940
 1941
 1942
 1943
 1944
 1945
 1946
 1947
 1948
 1949
 1950
 1951
 1952
 1953
 1954
 1955
 1956
 1957
 1958
 1959
 1960
 1961
 1962
 1963
 1964
 1965
 1966
 1967
 1968
 1969
 1970
 1971
 1972
 1973
 1974
 1975
 1976
 1977
 1978
 1979
 1980
 1981
 1982
 1983
 1984
 1985
 1986
 1987
 1988
 1989
 1990
 1991
 1992
 1993
 1994
 1995
 1996
 1997
 1998
 1999
 2000
 2001
 2002
 2003
 2004
 2005
 2006
 2007
 2008
 2009
 2010
 2011
 2012
 2013
 2014
 2015
 2016
 2017
 2018
 2019
 2020
 2021
 2022
 2023
 2024
 2025
 2026
 2027
 2028
 2029
 2030
 2031
 2032
 2033
 2034
 2035
 2036
 2037
 2038
 2039
 2040
 2041
 2042
 2043
 2044
 2045
 2046
 2047
 2048
 2049
 2050
 2051
 2052
 2053
 2054
 2055
 2056
 2057
 2058
 2059
 2060
 2061
 2062
 2063
 2064
 2065
 2066
 2067
 2068
 2069
 2070
 2071
 2072
 2073
 2074
 2075
 2076
 2077
 2078
 2079
 2080
 2081
 2082
 2083
 2084
 2085
 2086
 2087
 2088
 2089
 2090
 2091
 2092
 2093
 2094
 2095
 2096
 2097
 2098
 2099
 2100
 2101
 2102
 2103
 2104
 2105
 2106
 2107
 2108
 2109
 2110
 2111
 2112
 2113
 2114
 2115
 2116
 2117
 2118
 2119
 2120
 2121
 2122
 2123
 2124
 2125
 2126
 2127
 2128
 2129
 2130
 2131
 2132
 2133
 2134
 2135
 2136
 2137
 2138
 2139
 2140
 2141
 2142
 2143
 2144
 2145
 2146
 2147
 2148
 2149
 2150
 2151
 2152
 2153
 2154
 2155
 2156
 2157
 2158
 2159
 2160
 2161
 2162
 2163
 2164
 2165
 2166
 2167
 2168
 2169
 2170
 2171
 2172
 2173
 2174
 2175
 2176
 2177
 2178
 2179
 2180
 2181
 2182
 2183
 2184
 2185
 2186
 2187
 2188
 2189
 2190
 2191
 2192
 2193
 2194
 2195
 2196
 2197
 2198
 2199
 2200
 2201
 2202
 2203
 2204
 2205
 2206
 2207
 2208
 2209
 2210
 2211
 2212
 2213
 2214
 2215
 2216
 2217
 2218
 2219
 2220
 2221
 2222
 2223
 2224
 2225
 2226
 2227
 2228
 2229
 2230
 2231
 2232
 2233
 2234
 2235
 2236
 2237
 2238
 2239
 2240
 2241
 2242
 2243
 2244
 2245
 2246
 2247
 2248
 2249
 2250
 2251
 2252
 2253
 2254
 2255
 2256
 2257
 2258
 2259
 2260
 2261
 2262
 2263
 2264
 2265
 2266
 2267
 2268
 2269
 2270
 2271
 2272
 2273
 2274
 2275
 2276
 2277
 2278
 2279
 2280
 2281
 2282
 2283
 2284
 2285
 2286
 2287
 2288
 2289
 2290
 2291
 2292
 2293
 2294
 2295
 2296
 2297
 2298
 2299
 2300
 2301
 2302
 2303
 2304
 2305
 2306
 2307
 2308
 2309
 2310
 2311
 2312
 2313
 2314
 2315
 2316
 2317
 2318
 2319
 2320
 2321
 2322
 2323
 2324
 2325
 2326
 2327
 2328
 2329
 2330
 2331
 2332
 2333
 2334
 2335
 2336
 2337
 2338
 2339
 2340
 2341
 2342
 2343
 2344
 2345
 2346
 2347
 2348
 2349
 2350
 2351
 2352
 2353
 2354
 2355
 2356
 2357
 2358
 2359
 2360
 2361
 2362
 2363
 2364
 2365
 2366
 2367
 2368
 2369
 2370
 2371
 2372
 2373
 2374
 2375
 2376
 2377
 2378
 2379
 2380
 2381
 2382
 2383
 2384
 2385
 2386
 2387
 2388
 2389
 2390
 2391
 2392
 2393
 2394
 2395
 2396
 2397
 2398
 2399
 2400
 2401
 2402
 2403
 2404
 2405
 2406
 2407
 2408
 2409
 2410
 2411
 2412
 2413
 2414
 2415
 2416
 2417
 2418
 2419
 2420
 2421
 2422
 2423
 2424
 2425
 2426
 2427
 2428
 2429
 2430
 2431
 2432
 2433
 2434
 2435
 2436
 2437
 2438
 2439
 2440
 2441
 2442
 2443
 2444
 2445
 2446
 2447
 2448
 2449
 2450
 2451
 2452
 2453
 2454
 2455
 2456
 2457
 2458
 2459
 2460
 2461
 2462
 2463
 2464
 2465
 2466
 2467
 2468
 2469
 2470
 2471
 2472
 2473
 2474
 2475
 2476
 2477
 2478
 2479
 2480
 2481
 2482
 2483
 2484
 2485
 2486
 2487
 2488
 2489
 2490
 2491
 2492
 2493
 2494
 2495
 2496
 2497
 2498
 2499
 2500
 2501
 2502
 2503
 2504
 2505
 2506
 2507
 2508
 2509
 2510
 2511
 2512
 2513
 2514
 2515
 2516
 2517
 2518
 2519
 2520
 2521
 2522
 2523
 2524
 2525
 2526
 2527
 2528
 2529
 2530
 2531
 2532
 2533
 2534
 2535
 2536
 2537
 2538
 2539
 2540
 2541
 2542
 2543
 2544
 2545
 2546
 2547
 2548
 2549
 2550
 2551
 2552
 2553
 2554
 2555
 2556
 2557
 2558
 2559
 2560
 2561
 2562
 2563
 2564
 2565
 2566
 2567
 2568
 2569
 2570
 2571
 2572
 2573
 2574
 2575
 2576
 2577
 2578
 2579
 2580
 2581
 2582
 2583
 2584
 2585
 2586
 2587
 2588
 2589
 2590
 2591
 2592
 2593
 2594
 2595
 2596
 2597
 2598
 2599
 2600
 2601
 2602
 2603
 2604
 2605
 2606
 2607
 2608
 2609
 2610
 2611
 2612
 2613
 2614
 2615
 2616
 2617
 2618
 2619
 2620
 2621
 2622
 2623
 2624
 2625
 2626
 2627
 2628
 2629
 2630
 2631
 2632
 2633
 2634
 2635
 2636
 2637
 2638
 2639
 2640
 2641
 2642
 2643
 2644
 2645
 2646
 2647
 2648
 2649
 2650
 2651
 2652
 2653
 2654
 2655
 2656
 2657
 2658
 2659
 2660
 2661
 2662
 2663
 2664
 2665
 2666
 2667
 2668
 2669
 2670
 2671
 2672
 2673
 2674
 2675
 2676
 2677
 2678
 2679
 2680
 2681
 2682
 2683
 2684
 2685
 2686
 2687
 2688
 2689
 2690
 2691
 2692
 2693
 2694
 2695
 2696
 2697
 2698
 2699
 2700
 2701
 2702
 2703
 2704
 2705
 2706
 2707
 2708
 2709
 2710
 2711
 2712
 2713
 2714
 2715
 2716
 2717
 2718
 2719
 2720
 2721
 2722
 2723
 2724
 2725
 2726
 2727
 2728
 2729
 2730
 2731
 2732
 2733
 2734
 2735
 2736
 2737
 2738
 2739
 2740
 2741
 2742
 2743
 2744
 2745
 2746
 2747
 2748
 2749
 2750
 2751
 2752
 2753
 2754
 2755
 2756
 2757
 2758
 2759
 2760
 2761
 2762
 2763
 2764
 2765
 2766
 2767
 2768
 2769
 2770
 2771
 2772
 2773
 2774
 2775
 2776
 2777
 2778
 2779
 2780
 2781
 2782
 2783
 2784
 2785
 2786
 2787
 2788
 2789
 2790
 2791
 2792
 2793
 2794
 2795
 2796
 2797
 2798
 2799
 2800
 2801
 2802
 2803
 2804
 2805
 2806
 2807
 2808
 2809
 2810
 2811
 2812
 2813
 2814
 2815
 2816
 2817
 2818
 2819
 2820
 2821
 2822
 2823
 2824
 2825
 2826
 2827
 2828
 2829
 2830
 2831
 2832
 2833
 2834
 2835
 2836
 2837
 2838
 2839
 2840
 2841
 2842
 2843
 2844
 2845
 2846
 2847
 2848
 2849
 2850
 2851
 2852
 2853
 2854
 2855
 2856
 2857
 2858
 2859
 2860
 2861
 2862
 2863
 2864
 2865
 2866
 2867
 2868
 2869
 2870
 2871
 2872
 2873
 2874
 2875
 2876
 2877
 2878
 2879
 2880
 2881
 2882
 2883
 2884
 2885
 2886
 2887
 2888
 2889
 2890
 2891
 2892
 2893
 2894
 2895
 2896
 2897
 2898
 2899
 2900
 2901
 2902
 2903
 2904
 2905
 2906
 2907
 2908
 2909
 2910
 2911
 2912
 2913
 2914
 2915
 2916
 2917
 2918
 2919
 2920
 2921
 2922
 2923
 2924
 2925
 2926
 2927
 2928
 2929
 2930
 2931
 2932
 2933
 2934
 2935
 2936
 2937
 2938
 2939
 2940
 2941
 2942
 2943
 2944
 2945
 2946
 2947
 2948
 2949
 2950
 2951
 2952
 2953
 2954
 2955
 2956
 2957
 2958
 2959
 2960
 2961
 2962
 2963
 2964
 2965
 2966
 2967
 2968
 2969
 2970
 2971
 2972
 2973
 2974
 2975
 2976
 2977
 2978
 2979
 2980
 2981
 2982
 2983
 2984
 2985
 2986
 2987
 2988
 2989
 2990
 2991
 2992
 2993
 2994
 2995
 2996
 2997
 2998
 2999
 3000
 3001
 3002
 3003
 3004
 3005
 3006
 3007
 3008
 3009
 3010
 3011
 3012
 3013
 3014
 3015
 3016
 3017
 3018
 3019
 3020
 3021
 3022
 3023
 3024
 3025
 3026
 3027
 3028
 3029
 3030
 3031
 3032
 3033
 3034
 3035
 3036
 3037
 3038
 3039
 3040
 3041
 3042
 3043
 3044
 3045
 3046
 3047
 3048
 3049
 3050
 3051
 3052
 3053
 3054
 3055
 3056
 3057
 3058
 3059
 3060
 3061
 3062
 3063
 3064
 3065
 3066
 3067
 3068
 3069
 3070
 3071
 3072
 3073
 3074
 3075
 3076
 3077
 3078
 3079
 3080
 3081
 3082
 3083
 3084
 3085
 3086
 3087
 3088
 3089
 3090
 3091
 3092
 3093
 3094
 3095
 3096
 3097
 3098
 3099
 3100
 3101
 3102
 3103
 3104
 3105
 3106
 3107
 3108
 3109
 3110
 3111
 3112
 3113
 3114
 3115
 3116
 3117
 3118
 3119
 3120
 3121
 3122
 3123
 3124
 3125
 3126
 3127
 3128
 3129
 3130
 3131
 3132
 3133
 3134
 3135
 3136
 3137
 3138
 3139
 3140
 3141
 3142
 3143
 3144
 3145
 3146
 3147
 3148
 3149
 3150
 3151
 3152
 3153
 3154
 3155
 3156
 3157
 3158
 3159
 3160
 3161
 3162
 3163
 3164
 3165
 3166
 3167
 3168
 3169
 3170
 3171
 3172
 3173
 3174
 3175
 3176
 3177
 3178
 3179
 3180
 3181
 3182
 3183
 3184
 3185
 3186
 3187
 3188
 3189
 3190
 3191
 3192
 3193
 3194
 3195
 3196
 3197
 3198
 3199
 3200
 3201
 3202
 3203
 3204
 3205
 3206
 3207
 3208
 3209
 3210
 3211
 3212
 3213
 3214
 3215
 3216
 3217
 3218
 3219
 3220
 3221
 3222
 3223
 3224
 3225
 3226
 3227
 3228
 3229
 3230
 3231
 3232
 3233
 3234
 3235
 3236
 3237
 3238
 3239
 3240
 3241
 3242
 3243
 3244
 3245
 3246
 3247
 3248
 3249
 3250
 3251
 3252
 3253
 3254
 3255
 3256
 3257
 3258
 3259
 3260
 3261
 3262
 3263
 3264
 3265
 3266
 3267
 3268
 3269
 3270
 3271
 3272
 3273
 3274
 3275
 3276
 3277
 3278
 3279
 3280
 3281
 3282
 3283
 3284
 3285
 3286
 3287
 3288
 3289
 3290
 3291
 3292
 3293
 3294
 3295
 3296
 3297
 3298
 3299
 3300
 3301
 3302
 3303
 3304
 3305
 3306
 3307
 3308
 3309
 3310
 3311
 3312
 3313
 3314
 3315
 3316
 3317
 3318
 3319
 3320
 3321
 3322
 3323
 3324
 3325
 3326
 3327
 3328
 3329
 3330
 3331
 3332
 3333
 3334
 3335
 3336
 3337
 3338
 3339
 3340
 3341
 3342
 3343
 3344
 3345
 3346
 3347
 3348
 3349
 3350
 3351
 3352
 3353
 3354
 3355
 3356
 3357
 3358
 3359
 3360
 3361
 3362
 3363
 3364
 3365
 3366
 3367
 3368
 3369
 3370
 3371
 3372
 3373
 3374
 3375
 3376
 3377
 3378
 3379
 3380
 3381
 3382
 3383
 3384
 3385
 3386
 3387
 3388
 3389
 3390
 3391
 3392
 3393
 3394
 3395
 3396
 3397
 3398
 3399
 3400
 3401
 3402
 3403
 3404
 3405
 3406
 3407
 3408
 3409
 3410
 3411
 3412
 3413
 3414
 3415
 3416
 3417
 3418
 3419
 3420
 3421
 3422
 3423
 3424
 3425
 3426
 3427
 3428
 3429
 3430
 3431
 3432
 3433
 3434
 3435
 3436
 3437
 3438
 3439
 3440
 3441
 3442
 3443
 3444
 3445
 3446
 3447
 3448
 3449
 3450
 3451
 3452
 3453
 3454
 3455
 3456
 3457
 3458
 3459
 3460
 3461
 3462
 3463
 3464
 3465
 3466
 3467
 3468
 3469
 3470
 3471
 3472
 3473
 3474
 3475
 3476
 3477
 3478
 3479
 3480
 3481
 3482
 3483
 3484
 3485
 3486
 3487
 3488
 3489
 3490
 3491
 3492
 3493
 3494
 3495
 3496
 3497
 3498
 3499
 3500
 3501
 3502
 3503
 3504
 3505
 3506
 3507
 3508
 3509
 3510
 3511
 3512
 3513
 3514
 3515
 3516
 3517
 3518
 3519
 3520
 3521
 3522
 3523
 3524
 3525
 3526
 3527
 3528
 3529
 3530
 3531
 3532
 3533
 3534
 3535
 3536
 3537
 3538
 3539
 3540
 3541
 3542
 3543
 3544
 3545
 3546
 3547
 3548
 3549
 3550
 3551
 3552
 3553
 3554
 3555
 3556
 3557
 3558
 3559
 3560
 3561
 3562
 3563
 3564
 3565
 3566
 3567
 3568
 3569
 3570
 3571
 3572
 3573
 3574
 3575
 3576
 3577
 3578
 3579
 3580
 3581
 3582
 3583
 3584
 3585
 3586
 3587
 3588
 3589
 3590
 3591
 3592
 3593
 3594
 3595
 3596
 3597
 3598
 3599
 3600
 3601
 3602
 3603
 3604
 3605
 3606
 3607
 3608
 3609
 3610
 3611
 3612
 3613
 3614
 3615
 3616
 3617
 3618
 3619
 3620
 3621
 3622
 3623
 3624
 3625
 3626
 3627
 3628
 3629
 3630
 3631
 3632
 3633
 3634
 3635
 3636
 3637
 3638
 3639
 3640
 3641
 3642
 3643
 3644
 3645
 3646
 3647
 3648
 3649
 3650
 3651
 3652
 3653
 3654
 3655
 3656
 3657
 3658
 3659
 3660
 3661
 3662
 3663
 3664
 3665
 3666
 3667
 3668
 3669
 3670
 3671
 3672
 3673
 3674
 3675
 3676
 3677
 3678
 3679
 3680
 3681
 3682
 3683
 3684
 3685
 3686
 3687
 3688
 3689
 3690
 3691
 3692
 3693
 3694
 3695
 3696
 3697
 3698
 3699
 3700
 3701
 3702
 3703
 3704
 3705
 3706
 3707
 3708
 3709
 3710
 3711
 3712
 3713
 3714
 3715
 3716
 3717
 3718
 3719
 3720
 3721
 3722
 3723
 3724
 3725
 3726
 3727
 3728
 3729
 3730
 3731
 3732
 3733
 3734
 3735
 3736
 3737
 3738
 3739
 3740
 3741
 3742
 3743
 3744
 3745
 3746
 3747
 3748
 3749
 3750
 3751
 3752
 3753
 3754
 3755
 3756
 3757
 3758
 3759
 3760
 3761
 3762
 3763
 3764
 3765
 3766
 3767
 3768
 3769
 3770
 3771
 3772
 3773
 3774
 3775
 3776
 3777
 3778
 3779
 3780
 3781
 3782
 3783
 3784
 3785
 3786
 3787
 3788
 3789
 3790
 3791
 3792
 3793
 3794
 3795
 3796
 3797
 3798
 3799
 3800
 3801
 3802
 3803
 3804
 3805
 3806
 3807
 3808
 3809
 3810
 3811
 3812
 3813
 3814
 3815
 3816
 3817
 3818
 3819
 3820
 3821
 3822
 3823
 3824
 3825
 3826
 3827
 3828
 3829
 3830
 3831
 3832
 3833
 3834
 3835
 3836
 3837
 3838
 3839
 3840
 3841
 3842
 3843
 3844
 3845
 3846
 3847
 3848
 3849
 3850
 3851
 3852
 3853
 3854
 3855
 3856
 3857
 3858
 3859
 3860
 3861
 3862
 3863
 3864
 3865
 3866
 3867
 3868
 3869
 3870
 3871
 3872
 3873
 3874
 3875
 3876
 3877
 3878
 3879
 3880
 3881
 3882
 3883
 3884
 3885
 3886
 3887
 3888
 3889
 3890
 3891
 3892
 3893
 3894
 3895
 3896
 3897
 3898
 3899
 3900
 3901
 3902
 3903
 3904
 3905
 3906
 3907
 3908
 3909
 3910
 3911
 3912
 3913
 3914
 3915
 3916
 3917
 3918
 3919
 3920
 3921
 3922
 3923
 3924
 3925
 3926
 3927
 3928
 3929
 3930
 3931
 3932
 3933
 3934
 3935
 3936
 3937
 3938
 3939
 3940
 3941
 3942
 3943
 3944
 3945
 3946
 3947
 3948
 3949
 3950
 3951
 3952
 3953
 3954
 3955
 3956
 3957
 3958
 3959
 3960
 3961
 3962
 3963
 3964
 3965
 3966
 3967
 3968
 3969
 3970
 3971
 3972
 3973
 3974
 3975
 3976
 3977
 3978
 3979
 3980
 3981
 3982
 3983
 3984
 3985
 3986
 3987
 3988
 3989
 3990
 3991
 3992
 3993
 3994
 3995
 3996
 3997
 3998
 3999
 4000
 4001
 4002
 4003
 4004
 4005
 4006
 4007
 4008
 4009
 4010
 4011
 4012
 4013
 4014
 4015
 4016
 4017
 4018
 4019
 4020
 4021
 4022
 4023
 4024
 4025
 4026
 4027
 4028
 4029
 4030
 4031
 4032
 4033
 4034
 4035
 4036
 4037
 4038
 4039
 4040
 4041
 4042
 4043
 4044
 4045
 4046
 4047
 4048
 4049
 4050
 4051
 4052
 4053
 4054
 4055
 4056
 4057
 4058
 4059
 4060
 4061
 4062
 4063
 4064
 4065
 4066
 4067
 4068
 4069
 4070
 4071
 4072
 4073
 4074
 4075
 4076
 4077
 4078
 4079
 4080
 4081
 4082
 4083
 4084
 4085
 4086
 4087
 4088
 4089
 4090
 4091
 4092
 4093
 4094
 4095
 4096
 4097
 4098
 4099
 4100
 4101
 4102
 4103
 4104
 4105
 4106
 4107
 4108
 4109
 4110
 4111
 4112
 4113
 4114
 4115
 4116
 4117
 4118
 4119
 4120
 4121
 4122
 4123
 4124
 4125
 4126
 4127
 4128
 4129
 4130
 4131
 4132
 4133
 4134
 4135
 4136
 4137
 4138
 4139
 4140
 4141
 4142
 4143
 4144
 4145
 4146
 4147
 4148
 4149
 4150
 4151
 4152
 4153
 4154
 4155
 4156
 4157
 4158
 4159
 4160
 4161
 4162
 4163
 4164
 4165
 4166
 4167
 4168
 4169
 4170
 4171
 4172
 4173
 4174
 4175
 4176
 4177
 4178
 4179
 4180
 4181
 4182
 4183
 4184
 4185
 4186
 4187
 4188
 4189
 4190
 4191
 4192
 4193
 4194
 4195
 4196
 4197
 4198
 4199
 4200
 4201
 4202
 4203
 4204
 4205
 4206
 4207
 4208
 4209
 4210
 4211
 4212
 4213
 4214
 4215
 4216
 4217
 4218
 4219
 4220
 4221
 4222
 4223
 4224
 4225
 4226
 4227
 4228
 4229
 4230
 4231
 4232
 4233
 4234
 4235
 4236
 4237
 4238
 4239
 4240
 4241
 4242
 4243
 4244
 4245
 4246
 4247
 4248
 4249
 4250
 4251
 4252
 4253
 4254
 4255
 4256
 4257
 4258
 4259
 4260
 4261
 4262
 4263
 4264
 4265
 4266
 4267
 4268
 4269
 4270
 4271
 4272
 4273
 4274
 4275
 4276
 4277
 4278
 4279
 4280
 4281
 4282
 4283
 4284
 4285
 4286
 4287
 4288
 4289
 4290
 4291
 4292
 4293
 4294
 4295
 4296
 4297
 4298
 4299
 4300
 4301
 4302
 4303
 4304
 4305
 4306
 4307
 4308
 4309
 4310
 4311
 4312
 4313
 4314
 4315
 4316
 4317
 4318
 4319
 4320
 4321
 4322
 4323
 4324
 4325
 4326
 4327
 4328
 4329
 4330
 4331
 4332
 4333
 4334
 4335
 4336
 4337
 4338
 4339
 4340
 4341
 4342
 4343
 4344
 4345
 4346
 4347
 4348
 4349
 4350
 4351
 4352
 4353
 4354
 4355
 4356
 4357
 4358
 4359
 4360
 4361
 4362
 4363
 4364
 4365
 4366
 4367
 4368
 4369
 4370
 4371
 4372
 4373
 4374
 4375
 4376
 4377
 4378
 4379
 4380
 4381
 4382
 4383
 4384
 4385
 4386
 4387
 4388
 4389
 4390
 4391
 4392
 4393
 4394
 4395
 4396
 4397
 4398
 4399
 4400
 4401
 4402
 4403
 4404
 4405
 4406
 4407
 4408
 4409
 4410
 4411
 4412
 4413
 4414
 4415
 4416
 4417
 4418
 4419
 4420
 4421
 4422
 4423
 4424
 4425
 4426
 4427
 4428
 4429
 4430
 4431
 4432
 4433
 4434
 4435
 4436
 4437
 4438
 4439
 4440
 4441
 4442
 4443
 4444
 4445
 4446
 4447
 4448
 4449
 4450
 4451
 4452
 4453
 4454
 4455
 4456
 4457
 4458
 4459
 4460
 4461
 4462
 4463
 4464
 4465
 4466
 4467
 4468
 4469
 4470
 4471
 4472
 4473
 4474
 4475
 4476
 4477
 4478
 4479
 4480
 4481
 4482
 4483
 4484
 4485
 4486
 4487
 4488
 4489
 4490
 4491
 4492
 4493
 4494
 4495
 4496
 4497
 4498
 4499
 4500
 4501
 4502
 4503
 4504
 4505
 4506
 4507
 4508
 4509
 4510
 4511
 4512
 4513
 4514
 4515
 4516
 4517
 4518
 4519
 4520
 4521
 4522
 4523
 4524
 4525
 4526
 4527
 4528
 4529
 4530
 4531
 4532
 4533
 4534
 4535
 4536
 4537
 4538
 4539
 4540
 4541
 4542
 4543
 4544
 4545
 4546
 4547
 4548
 4549
 4550
 4551
 4552
 4553
 4554
 4555
 4556
 4557
 4558
 4559
 4560
 4561
 4562
 4563
 4564
 4565
 4566
 4567
 4568
 4569
 4570
 4571
 4572
 4573
 4574
 4575
 4576
 4577
 4578
 4579
 4580
 4581
 4582
 4583
 4584
 4585
 4586
 4587
 4588
 4589
 4590
 4591
 4592
 4593
 4594
 4595
 4596
 4597
 4598
 4599
 4600
 4601
 4602
 4603
 4604
 4605
 4606
 4607
 4608
 4609
 4610
 4611
 4612
 4613
 4614
 4615
 4616
 4617
 4618
 4619
 4620
 4621
 4622
 4623
 4624
 4625
 4626
 4627
 4628
 4629
 4630
 4631
 4632
 4633
 4634
 4635
 4636
 4637
 4638
 4639
 4640
 4641
 4642
 4643
 4644
 4645
 4646
 4647
 4648
 4649
 4650
 4651
 4652
 4653
 4654
 4655
 4656
 4657
 4658
 4659
 4660
 4661
 4662
 4663
 4664
 4665
 4666
 4667
 4668
 4669
 4670
 4671
 4672
 4673
 4674
 4675
 4676
 4677
 4678
 4679
 4680
 4681
 4682
 4683
 4684
 4685
 4686
 4687
 4688
 4689
 4690
 4691
 4692
 4693
 4694
 4695
 4696
 4697
 4698
 4699
 4700
 4701
 4702
 4703
 4704
 4705
 4706
 4707
 4708
 4709
 4710
 4711
 4712
 4713
 4714
 4715
 4716
 4717
 4718
 4719
 4720
 4721
 4722
 4723
 4724
 4725
 4726
 4727
 4728
 4729
 4730
 4731
 4732
 4733
 4734
 4735
 4736
 4737
 4738
 4739
 4740
 4741
 4742
 4743
 4744
 4745
 4746
 4747
 4748
 4749
 4750
 4751
 4752
 4753
 4754
 4755
 4756
 4757
 4758
 4759
 4760
 4761
 4762
 4763
 4764
 4765
 4766
 4767
 4768
 4769
 4770
 4771
 4772
 4773
 4774
 4775
 4776
 4777
 4778
 4779
 4780
 4781
 4782
 4783
 4784
 4785
 4786
 4787
 4788
 4789
 4790
 4791
 4792
 4793
 4794
 4795
 4796
 4797
 4798
 4799
 4800
 4801
 4802
 4803
 4804
 4805
 4806
 4807
 4808
 4809
 4810
 4811
 4812
 4813
 4814
 4815
 4816
 4817
 4818
 4819
 4820
 4821
 4822
 4823
 4824
 4825
 4826
 4827
 4828
 4829
 4830
 4831
 4832
 4833
 4834
 4835
 4836
 4837
 4838
 4839
 4840
 4841
 4842
 4843
 4844
 4845
 4846
 4847
 4848
 4849
 4850
 4851
 4852
 4853
 4854
 4855
 4856
 4857
 4858
 4859
 4860
 4861
 4862
 4863
 4864
 4865
 4866
 4867
 4868
 4869
 4870
 4871
 4872
 4873
 4874
 4875
 4876
 4877
 4878
 4879
 4880
 4881
 4882
 4883
 4884
 4885
 4886
 4887
 4888
 4889
 4890
 4891
 4892
 4893
 4894
 4895
 4896
 4897
 4898
 4899
 4900
 4901
 4902
 4903
 4904
 4905
 4906
 4907
 4908
 4909
 4910
 4911
 4912
 4913
 4914
 4915
 4916
 4917
 4918
 4919
 4920
 4921
 4922
 4923
 4924
 4925
 4926
 4927
 4928
 4929
 4930
 4931
 4932
 4933
 4934
 4935
 4936
 4937
 4938
 4939
 4940
 4941
 4942
 4943
 4944
 4945
 4946
 4947
 4948
 4949
 4950
 4951
 4952
 4953
 4954
 4955
 4956
 4957
 4958
 4959
 4960
 4961
 4962
 4963
 4964
 4965
 4966
 4967
 4968
 4969
 4970
 4971
 4972
 4973
 4974
 4975
 4976
 4977
 4978
 4979
 4980
 4981
 4982
 4983
 4984
 4985
 4986
 4987
 4988
 4989
 4990
 4991
 4992
 4993
 4994
 4995
 4996
 4997
 4998
 4999
 5000
 5001
 5002
 5003
 5004
 5005
 5006
 5007
 5008
 5009
 5010
 5011
 5012
 5013
 5014
 5015
 5016
 5017
 5018
 5019
 5020
 5021
 5022
 5023
 5024
 5025
 5026
 5027
 5028
 5029
 5030
 5031
 5032
 5033
 5034
 5035
 5036
 5037
 5038
 5039
 5040
 5041
 5042
 5043
 5044
 5045
 5046
 5047
 5048
 5049
 5050
 5051
 5052
 5053
 5054
 5055
 5056
 5057
 5058
 5059
 5060
 5061
 5062
 5063
 5064
 5065
 5066
 5067
 5068
 5069
 5070
 5071
 5072
 5073
 5074
 5075
 5076
 5077
 5078
 5079
 5080
 5081
 5082
 5083
 5084
 5085
 5086
 5087
 5088
 5089
 5090
 5091
 5092
 5093
 5094
 5095
 5096
 5097
 5098
 5099
 5100
 5101
 5102
 5103
 5104
 5105
 5106
 5107
 5108
 5109
 5110
 5111
 5112
 5113
 5114
 5115
 5116
 5117
 5118
 5119
 5120
 5121
 5122
 5123
 5124
 5125
 5126
 5127
 5128
 5129
 5130
 5131
 5132
 5133
 5134
 5135
 5136
 5137
 5138
 5139
 5140
 5141
 5142
 5143
 5144
 5145
 5146
 5147
 5148
 5149
 5150
 5151
 5152
 5153
 5154
 5155
 5156
 5157
 5158
 5159
 5160
 5161
 5162
 5163
 5164
 5165
 5166
 5167
 5168
 5169
 5170
 5171
 5172
 5173
 5174
 5175
 5176
 5177
 5178
 5179
 5180
 5181
 5182
 5183
 5184
 5185
 5186
 5187
 5188
 5189
 5190
 5191
 5192
 5193
 5194
 5195
 5196
 5197
 5198
 5199
 5200
 5201
 5202
 5203
 5204
 5205
 5206
 5207
 5208
 5209
 5210
 5211
 5212
 5213
 5214
 5215
 5216
 5217
 5218
 5219
 5220
 5221
 5222
 5223
 5224
 5225
 5226
 5227
 5228
 5229
 5230
 5231
 5232
 5233
 5234
 5235
 5236
 5237
 5238
 5239
 5240
 5241
 5242
 5243
 5244
 5245
 5246
 5247
 5248
 5249
 5250
 5251
 5252
 5253
 5254
 5255
 5256
 5257
 5258
 5259
 5260
 5261
 5262
 5263
 5264
 5265
 5266
 5267
 5268
 5269
 5270
 5271
 5272
 5273
 5274
 5275
 5276
 5277
 5278
 5279
 5280
 5281
 5282
 5283
 5284
 5285
 5286
 5287
 5288
 5289
 5290
 5291
 5292
 5293
 5294
 5295
 5296
 5297
 5298
 5299
 5300
 5301
 5302
 5303
 5304
 5305
 5306
 5307
 5308
 5309
 5310
 5311
 5312
 5313
 5314
 5315
 5316
 5317
 5318
 5319
 5320
 5321
 5322
 5323
 5324
 5325
 5326
 5327
 5328
 5329
 5330
 5331
 5332
 5333
 5334
 5335
 5336
 5337
 5338
 5339
 5340
 5341
 5342
 5343
 5344
 5345
 5346
 5347
 5348
 5349
 5350
 5351
 5352
 5353
 5354
 5355
 5356
 5357
 5358
 5359
 5360
 5361
 5362
 5363
 5364
 5365
 5366
 5367
 5368
 5369
 5370
 5371
 5372
 5373
 5374
 5375
 5376
 5377
 5378
 5379
 5380
 5381
 5382
 5383
 5384
 5385
 5386
 5387
 5388
 5389
 5390
 5391
 5392
 5393
 5394
 5395
 5396
 5397
 5398
 5399
 5400
 5401
 5402
 5403
 5404
 5405
 5406
 5407
 5408
 5409
 5410
 5411
 5412
 5413
 5414
 5415
 5416
 5417
 5418
 5419
 5420
 5421
 5422
 5423
 5424
 5425
 5426
 5427
 5428
 5429
 5430
 5431
 5432
 5433
 5434
 5435
 5436
 5437
 5438
 5439
 5440
 5441
 5442
 5443
 5444
 5445
 5446
 5447
 5448
 5449
 5450
 5451
 5452
 5453
 5454
 5455
 5456
 5457
 5458
 5459
 5460
 5461
 5462
 5463
 5464
 5465
 5466
 5467
 5468
 5469
 5470
 5471
 5472
 5473
 5474
 5475
 5476
 5477
 5478
 5479
 5480
 5481
 5482
 5483
 5484
 5485
 5486
 5487
 5488
 5489
 5490
 5491
 5492
 5493
 5494
 5495
 5496
 5497
 5498
 5499
 5500
 5501
 5502
 5503
 5504
 5505
 5506
 5507
 5508
 5509
 5510
 5511
 5512
 5513
 5514
 5515
 5516
 5517
 5518
 5519
 5520
 5521
 5522
 5523
 5524
 5525
 5526
 5527
 5528
 5529
 5530
 5531
 5532
 5533
 5534
 5535
 5536
 5537
 5538
 5539
 5540
 5541
 5542
 5543
 5544
 5545
 5546
 5547
 5548
 5549
 5550
 5551
 5552
 5553
 5554
 5555
 5556
 5557
 5558
 5559
 5560
 5561
 5562
 5563
 5564
 5565
 5566
 5567
 5568
 5569
 5570
 5571
 5572
 5573
 5574
 5575
 5576
 5577
 5578
 5579
 5580
 5581
 5582
 5583
 5584
 5585
 5586
 5587
 5588
 5589
 5590
 5591
 5592
 5593
 5594
 5595
 5596
 5597
 5598
 5599
 5600
 5601
 5602
 5603
 5604
 5605
 5606
 5607
 5608
 5609
 5610
 5611
 5612
 5613
 5614
 5615
 5616
 5617
 5618
 5619
 5620
 5621
 5622
 5623
 5624
 5625
 5626
 5627
 5628
 5629
 5630
 5631
 5632
 5633
 5634
 5635
 5636
 5637
 5638
 5639
 5640
 5641
 5642
 5643
 5644
 5645
 5646
 5647
 5648
 5649
 5650
 5651
 5652
 5653
 5654
 5655
 5656
 5657
 5658
 5659
 5660
 5661
 5662
 5663
 5664
 5665
 5666
 5667
 5668
 5669
 5670
 5671
 5672
 5673
 5674
 5675
 5676
 5677
 5678
 5679
 5680
 5681
 5682
 5683
 5684
 5685
 5686
 5687
 5688
 5689
 5690
 5691
 5692
 5693
 5694
 5695
 5696
 5697
 5698
 5699
 5700
 5701
 5702
 5703
 5704
 5705
 5706
 5707
 5708
 5709
 5710
 5711
 5712
 5713
 5714
 5715
 5716
 5717
 5718
 5719
 5720
 5721
 5722
 5723
 5724
 5725
 5726
 5727
 5728
 5729
 5730
 5731
 5732
 5733
 5734
 5735
 5736
 5737
 5738
 5739
 5740
 5741
 5742
 5743
 5744
 5745
 5746
 5747
 5748
 5749
 5750
 5751
 5752
 5753
 5754
 5755
 5756
 5757
 5758
 5759
 5760
 5761
 5762
 5763
 5764
 5765
 5766
 5767
 5768
 5769
 5770
 5771
 5772
 5773
 5774
 5775
 5776
 5777
 5778
 5779
 5780
 5781
 5782
 5783
 5784
 5785
 5786
 5787
 5788
 5789
 5790
 5791
 5792
 5793
 5794
 5795
 5796
 5797
 5798
 5799
 5800
 5801
 5802
 5803
 5804
 5805
 5806
 5807
 5808
 5809
 5810
 5811
 5812
 5813
 5814
 5815
 5816
 5817
 5818
 5819
 5820
 5821
 5822
 5823
 5824
 5825
 5826
 5827
 5828
 5829
 5830
 5831
 5832
 5833
 5834
 5835
 5836
 5837
 5838
 5839
 5840
 5841
 5842
 5843
 5844
 5845
 5846
 5847
 5848
 5849
 5850
 5851
 5852
 5853
 5854
 5855
 5856
 5857
 5858
 5859
 5860
 5861
 5862
 5863
 5864
 5865
 5866
 5867
 5868
 5869
 5870
 5871
 5872
 5873
 5874
 5875
 5876
 5877
 5878
 5879
 5880
 5881
 5882
 5883
 5884
 5885
 5886
 5887
 5888
 5889
 5890
 5891
 5892
 5893
 5894
 5895
 5896
 5897
 5898
 5899
 5900
 5901
 5902
 5903
 5904
 5905
 5906
 5907
 5908
 5909
 5910
 5911
 5912
 5913
 5914
 5915
 5916
 5917
 5918
 5919
 5920
 5921
 5922
 5923
 5924
 5925
 5926
 5927
 5928
 5929
 5930
 5931
 5932
 5933
 5934
 5935
 5936
 5937
 5938
 5939
 5940
 5941
 5942
 5943
 5944
 5945
 5946
 5947
 5948
 5949
 5950
 5951
 5952
 5953
 5954
 5955
 5956
 5957
 5958
 5959
 5960
 5961
 5962
 5963
 5964
 5965
 5966
 5967
 5968
 5969
 5970
 5971
 5972
 5973
 5974
 5975
 5976
 5977
 5978
 5979
 5980
 5981
 5982
 5983
 5984
 5985
 5986
 5987
 5988
 5989
 5990
 5991
 5992
 5993
 5994
 5995
 5996
 5997
 5998
 5999
 6000
 6001
 6002
 6003
 6004
 6005
 6006
 6007
 6008
 6009
 6010
 6011
 6012
 6013
 6014
 6015
 6016
 6017
 6018
 6019
 6020
 6021
 6022
 6023
 6024
 6025
 6026
 6027
 6028
 6029
 6030
 6031
 6032
 6033
 6034
 6035
 6036
 6037
 6038
 6039
 6040
 6041
 6042
 6043
 6044
 6045
 6046
 6047
 6048
 6049
 6050
 6051
 6052
 6053
 6054
 6055
 6056
 6057
 6058
 6059
 6060
 6061
 6062
 6063
 6064
 6065
 6066
 6067
 6068
 6069
 6070
 6071
 6072
 6073
 6074
 6075
 6076
 6077
 6078
 6079
 6080
 6081
 6082
 6083
 6084
 6085
 6086
 6087
 6088
 6089
 6090
 6091
 6092
 6093
 6094
 6095
 6096
 6097
 6098
 6099
 6100
 6101
 6102
 6103
 6104
 6105
 6106
 6107
 6108
 6109
 6110
 6111
 6112
 6113
 6114
 6115
 6116
 6117
 6118
 6119
 6120
 6121
 6122
 6123
 6124
 6125
 6126
 6127
 6128
 6129
 6130
 6131
 6132
 6133
 6134
 6135
 6136
 6137
 6138
 6139
 6140
 6141
 6142
 6143
 6144
 6145
 6146
 6147
 6148
 6149
 6150
 6151
 6152
 6153
 6154
 6155
 6156
 6157
 6158
 6159
 6160
 6161
 6162
 6163
 6164
 6165
 6166
 6167
 6168
 6169
 6170
 6171
 6172
 6173
 6174
 6175
 6176
 6177
 6178
 6179
 6180
 6181
 6182
 6183
 6184
 6185
 6186
 6187
 6188
 6189
 6190
 6191
 6192
 6193
 6194
 6195
 6196
 6197
 6198
 6199
 6200
 6201
 6202
 6203
 6204
 6205
 6206
 6207
 6208
 6209
 6210
 6211
 6212
 6213
 6214
 6215
 6216
 6217
 6218
 6219
 6220
 6221
 6222
 6223
 6224
 6225
 6226
 6227
 6228
 6229
 6230
 6231
 6232
 6233
 6234
 6235
 6236
 6237
 6238
 6239
 6240
 6241
 6242
 6243
 6244
 6245
 6246
 6247
 6248
 6249
 6250
 6251
 6252
 6253
 6254
 6255
 6256
 6257
 6258
 6259
 6260
 6261
 6262
 6263
 6264
 6265
 6266
 6267
 6268
 6269
 6270
 6271
 6272
 6273
 6274
 6275
 6276
 6277
 6278
 6279
 6280
 6281
 6282
 6283
 6284
 6285
 6286
 6287
 6288
 6289
 6290
 6291
 6292
 6293
 6294
 6295
 6296
 6297
 6298
 6299
 6300
 6301
 6302
 6303
 6304
 6305
 6306
 6307
 6308
 6309
 6310
 6311
 6312
 6313
 6314
 6315
 6316
 6317
 6318
 6319
 6320
 6321
 6322
 6323
 6324
 6325
 6326
 6327
 6328
 6329
 6330
 6331
 6332
 6333
 6334
 6335
 6336
 6337
 6338
 6339
 6340
 6341
 6342
 6343
 6344
 6345
 6346
 6347
 6348
 6349
 6350
 6351
 6352
 6353
 6354
 6355
 6356
 6357
 6358
 6359
 6360
 6361
 6362
 6363
 6364
 6365
 6366
 6367
 6368
 6369
 6370
 6371
 6372
 6373
 6374
 6375
 6376
 6377
 6378
 6379
 6380
 6381
 6382
 6383
 6384
 6385
 6386
 6387
 6388
 6389
 6390
 6391
 6392
 6393
 6394
 6395
 6396
 6397
 6398
 6399
 6400
 6401
 6402
 6403
 6404
 6405
 6406
 6407
 6408
 6409
 6410
 6411
 6412
 6413
 6414
 6415
 6416
 6417
 6418
 6419
 6420
 6421
 6422
 6423
 6424
 6425
 6426
 6427
 6428
 6429
 6430
 6431
 6432
 6433
 6434
 6435
 6436
 6437
 6438
 6439
 6440
 6441
 6442
 6443
 6444
 6445
 6446
 6447
 6448
 6449
 6450
 6451
 6452
 6453
 6454
 6455
 6456
 6457
 6458
 6459
 6460
 6461
 6462
 6463
 6464
 6465
 6466
 6467
 6468
 6469
 6470
 6471
 6472
 6473
 6474
 6475
 6476
 6477
 6478
 6479
 6480
 6481
 6482
 6483
 6484
 6485
 6486
 6487
 6488
 6489
 6490
 6491
 6492
 6493
 6494
 6495
 6496
 6497
 6498
 6499
 6500
 6501
 6502
 6503
 6504
 6505
 6506
 6507
 6508
 6509
 6510
 6511
 6512
 6513
 6514
 6515
 6516
 6517
 6518
 6519
 6520
 6521
 6522
 6523
 6524
 6525
 6526
 6527
 6528
 6529
 6530
 6531
 6532
 6533
 6534
 6535
 6536
 6537
 6538
 6539
 6540
 6541
 6542
 6543
 6544
 6545
 6546
 6547
 6548
 6549
 6550
 6551
 6552
 6553
 6554
 6555
 6556
 6557
 6558
 6559
 6560
 6561
 6562
 6563
 6564
 6565
 6566
 6567
 6568
 6569
 6570
 6571
 6572
 6573
 6574
 6575
 6576
 6577
 6578
 6579
 6580
 6581
 6582
 6583
 6584
 6585
 6586
 6587
 6588
 6589
 6590
 6591
 6592
 6593
 6594
 6595
 6596
 6597
 6598
 6599
 6600
 6601
 6602
 6603
 6604
 6605
 6606
 6607
 6608
 6609
 6610
 6611
 6612
 6613
 6614
 6615
 6616
 6617
 6618
 6619
 6620
 6621
 6622
 6623
 6624
 6625
 6626
 6627
 6628
 6629
 6630
 6631
 6632
 6633
 6634
 6635
 6636
 6637
 6638
 6639
 6640
 6641
 6642
 6643
 6644
 6645
 6646
 6647
 6648
 6649
 6650
 6651
 6652
 6653
 6654
 6655
 6656
 6657
 6658
 6659
 6660
 6661
 6662
 6663
 6664
 6665
 6666
 6667
 6668
 6669
 6670
 6671
 6672
 6673
 6674
 6675
 6676
 6677
 6678
 6679
 6680
 6681
 6682
 6683
 6684
 6685
 6686
 6687
 6688
 6689
 6690
 6691
 6692
 6693
 6694
 6695
 6696
 6697
 6698
 6699
 6700
 6701
 6702
 6703
 6704
 6705
 6706
 6707
 6708
 6709
 6710
 6711
 6712
 6713
 6714
 6715
 6716
 6717
 6718
 6719
 6720
 6721
 6722
 6723
 6724
 6725
 6726
 6727
 6728
 6729
 6730
 6731
 6732
 6733
 6734
 6735
 6736
 6737
 6738
 6739
 6740
 6741
 6742
 6743
 6744
 6745
 6746
 6747
 6748
 6749
 6750
 6751
 6752
 6753
 6754
 6755
 6756
 6757
 6758
 6759
 6760
 6761
 6762
 6763
 6764
 6765
 6766
 6767
 6768
 6769
 6770
 6771
 6772
 6773
 6774
 6775
 6776
 6777
 6778
 6779
 6780
 6781
 6782
 6783
 6784
 6785
 6786
 6787
 6788
 6789
 6790
 6791
 6792
 6793
 6794
 6795
 6796
 6797
 6798
 6799
 6800
 6801
 6802
 6803
 6804
 6805
 6806
 6807
 6808
 6809
 6810
 6811
 6812
 6813
 6814
 6815
 6816
 6817
 6818
 6819
 6820
 6821
 6822
 6823
 6824
 6825
 6826
 6827
 6828
 6829
 6830
 6831
 6832
 6833
 6834
 6835
 6836
 6837
 6838
 6839
 6840
 6841
 6842
 6843
 6844
 6845
 6846
 6847
 6848
 6849
 6850
 6851
 6852
 6853
 6854
 6855
 6856
 6857
 6858
 6859
 6860
 6861
 6862
 6863
 6864
 6865
 6866
 6867
 6868
 6869
 6870
 6871
 6872
 6873
 6874
 6875
 6876
 6877
 6878
 6879
 6880
 6881
 6882
 6883
 6884
 6885
 6886
 6887
 6888
 6889
 6890
 6891
 6892
 6893
 6894
 6895
 6896
 6897
 6898
 6899
 6900
 6901
 6902
 6903
 6904
 6905
 6906
 6907
 6908
 6909
 6910
 6911
 6912
 6913
 6914
 6915
 6916
 6917
 6918
 6919
 6920
 6921
 6922
 6923
 6924
 6925
 6926
 6927
 6928
 6929
 6930
 6931
 6932
 6933
 6934
 6935
 6936
 6937
 6938
 6939
 6940
 6941
 6942
 6943
 6944
 6945
 6946
 6947
 6948
 6949
 6950
 6951
 6952
 6953
 6954
 6955
 6956
 6957
 6958
 6959
 6960
 6961
 6962
 6963
 6964
 6965
 6966
 6967
 6968
 6969
 6970
 6971
 6972
 6973
 6974
 6975
 6976
 6977
 6978
 6979
 6980
 6981
 6982
 6983
 6984
 6985
 6986
 6987
 6988
 6989
 6990
 6991
 6992
 6993
 6994
 6995
 6996
 6997
 6998
 6999
 7000
 7001
 7002
 7003
 7004
 7005
 7006
 7007
 7008
 7009
 7010
 7011
 7012
 7013
 7014
 7015
 7016
 7017
 7018
 7019
 7020
 7021
 7022
 7023
 7024
 7025
 7026
 7027
 7028
 7029
 7030
 7031
 7032
 7033
 7034
 7035
 7036
 7037
 7038
 7039
 7040
 7041
 7042
 7043
 7044
 7045
 7046
 7047
 7048
 7049
 7050
 7051
 7052
 7053
 7054
 7055
 7056
 7057
 7058
 7059
 7060
 7061
 7062
 7063
 7064
 7065
 7066
 7067
 7068
 7069
 7070
 7071
 7072
 7073
 7074
 7075
 7076
 7077
 7078
 7079
 7080
 7081
 7082
 7083
 7084
 7085
 7086
 7087
 7088
 7089
 7090
 7091
 7092
 7093
 7094
 7095
 7096
 7097
 7098
 7099
 7100
 7101
 7102
 7103
 7104
 7105
 7106
 7107
 7108
 7109
 7110
 7111
 7112
 7113
 7114
 7115
 7116
 7117
 7118
 7119
 7120
 7121
 7122
 7123
 7124
 7125
 7126
 7127
 7128
 7129
 7130
 7131
 7132
 7133
 7134
 7135
 7136
 7137
 7138
 7139
 7140
 7141
 7142
 7143
 7144
 7145
 7146
 7147
 7148
 7149
 7150
 7151
 7152
 7153
 7154
 7155
 7156
 7157
 7158
 7159
 7160
 7161
 7162
 7163
 7164
 7165
 7166
 7167
 7168
 7169
 7170
 7171
 7172
 7173
 7174
 7175
 7176
 7177
 7178
 7179
 7180
 7181
 7182
 7183
 7184
 7185
 7186
 7187
 7188
 7189
 7190
 7191
 7192
 7193
 7194
 7195
 7196
 7197
 7198
 7199
 7200
 7201
 7202
 7203
 7204
 7205
 7206
 7207
 7208
 7209
 7210
 7211
 7212
 7213
 7214
 7215
 7216
 7217
 7218
 7219
 7220
 7221
 7222
 7223
 7224
 7225
 7226
 7227
 7228
 7229
 7230
 7231
 7232
 7233
 7234
 7235
 7236
 7237
 7238
 7239
 7240
 7241
 7242
 7243
 7244
 7245
 7246
 7247
 7248
 7249
 7250
 7251
 7252
 7253
 7254
 7255
 7256
 7257
 7258
 7259
 7260
 7261
 7262
 7263
 7264
 7265
 7266
 7267
 7268
 7269
 7270
 7271
 7272
 7273
 7274
 7275
 7276
 7277
 7278
 7279
 7280
 7281
 7282
 7283
 7284
 7285
 7286
 7287
 7288
 7289
 7290
 7291
 7292
 7293
 7294
 7295
 7296
 7297
 7298
 7299
 7300
 7301
 7302
 7303
 7304
 7305
 7306
 7307
 7308
 7309
 7310
 7311
 7312
 7313
 7314
 7315
 7316
 7317
 7318
 7319
 7320
 7321
 7322
 7323
 7324
 7325
 7326
 7327
 7328
 7329
 7330
 7331
 7332
 7333
 7334
 7335
 7336
 7337
 7338
 7339
 7340
 7341
 7342
 7343
 7344
 7345
 7346
 7347
 7348
 7349
 7350
 7351
 7352
 7353
 7354
 7355
 7356
 7357
 7358
 7359
 7360
 7361
 7362
 7363
 7364
 7365
 7366
 7367
 7368
 7369
 7370
 7371
 7372
 7373
 7374
 7375
 7376
 7377
 7378
 7379
 7380
 7381
 7382
 7383
 7384
 7385
 7386
 7387
 7388
 7389
 7390
 7391
 7392
 7393
 7394
 7395
 7396
 7397
 7398
 7399
 7400
 7401
 7402
 7403
 7404
 7405
 7406
 7407
 7408
 7409
 7410
 7411
 7412
 7413
 7414
 7415
 7416
 7417
 7418
 7419
 7420
 7421
 7422
 7423
 7424
 7425
 7426
 7427
 7428
 7429
 7430
 7431
 7432
 7433
 7434
 7435
 7436
 7437
 7438
 7439
 7440
 7441
 7442
 7443
 7444
 7445
 7446
 7447
 7448
 7449
 7450
 7451
 7452
 7453
 7454
 7455
 7456
 7457
 7458
 7459
 7460
 7461
 7462
 7463
 7464
 7465
 7466
 7467
 7468
 7469
 7470
 7471
 7472
 7473
 7474
 7475
 7476
 7477
 7478
 7479
 7480
 7481
 7482
 7483
 7484
 7485
 7486
 7487
 7488
 7489
 7490
 7491
 7492
 7493
 7494
 7495
 7496
 7497
 7498
 7499
 7500
 7501
 7502
 7503
 7504
 7505
 7506
 7507
 7508
 7509
 7510
 7511
 7512
 7513
 7514
 7515
 7516
 7517
 7518
 7519
 7520
 7521
 7522
 7523
 7524
 7525
 7526
 7527
 7528
 7529
 7530
 7531
 7532
 7533
 7534
 7535
 7536
 7537
 7538
 7539
 7540
 7541
 7542
 7543
 7544
 7545
 7546
 7547
 7548
 7549
 7550
 7551
 7552
 7553
 7554
 7555
 7556
 7557
 7558
 7559
 7560
 7561
 7562
 7563
 7564
 7565
 7566
 7567
 7568
 7569
 7570
 7571
 7572
 7573
 7574
 7575
 7576
 7577
 7578
 7579
 7580
 7581
 7582
 7583
 7584
 7585
 7586
 7587
 7588
 7589
 7590
 7591
 7592
 7593
 7594
 7595
 7596
 7597
 7598
 7599
 7600
 7601
 7602
 7603
 7604
 7605
 7606
 7607
 7608
 7609
 7610
 7611
 7612
 7613
 7614
 7615
 7616
 7617
 7618
 7619
 7620
 7621
 7622
 7623
 7624
 7625
 7626
 7627
 7628
 7629
 7630
 7631
 7632
 7633
 7634
 7635
 7636
 7637
 7638
 7639
 7640
 7641
 7642
 7643
 7644
 7645
 7646
 7647
 7648
 7649
 7650
 7651
 7652
 7653
 7654
 7655
 7656
 7657
 7658
 7659
 7660
 7661
 7662
 7663
 7664
 7665
 7666
 7667
 7668
 7669
 7670
 7671
 7672
 7673
 7674
 7675
 7676
 7677
 7678
 7679
 7680
 7681
 7682
 7683
 7684
 7685
 7686
 7687
 7688
 7689
 7690
 7691
 7692
 7693
 7694
 7695
 7696
 7697
 7698
 7699
 7700
 7701
 7702
 7703
 7704
 7705
 7706
 7707
 7708
 7709
 7710
 7711
 7712
 7713
 7714
 7715
 7716
 7717
 7718
 7719
 7720
 7721
 7722
 7723
 7724
 7725
 7726
 7727
 7728
 7729
 7730
 7731
 7732
 7733
 7734
 7735
 7736
 7737
 7738
 7739
 7740
 7741
 7742
 7743
 7744
 7745
 7746
 7747
 7748
 7749
 7750
 7751
 7752
 7753
 7754
 7755
 7756
 7757
 7758
 7759
 7760
 7761
 7762
 7763
 7764
 7765
 7766
 7767
 7768
 7769
 7770
 7771
 7772
 7773
 7774
 7775
 7776
 7777
 7778
 7779
 7780
 7781
 7782
 7783
 7784
 7785
 7786
 7787
 7788
 7789
 7790
 7791
 7792
 7793
 7794
 7795
 7796
 7797
 7798
 7799
 7800
 7801
 7802
 7803
 7804
 7805
 7806
 7807
 7808
 7809
 7810
 7811
 7812
 7813
 7814
 7815
 7816
 7817
 7818
 7819
 7820
 7821
 7822
 7823
 7824
 7825
 7826
 7827
 7828
 7829
 7830
 7831
 7832
 7833
 7834
 7835
 7836
 7837
 7838
 7839
 7840
 7841
 7842
 7843
 7844
 7845
 7846
 7847
 7848
 7849
 7850
 7851
 7852
 7853
 7854
 7855
 7856
 7857
 7858
 7859
 7860
 7861
 7862
 7863
 7864
 7865
 7866
 7867
 7868
 7869
 7870
 7871
 7872
 7873
 7874
 7875
 7876
 7877
 7878
 7879
 7880
 7881
 7882
 7883
 7884
 7885
 7886
 7887
 7888
 7889
 7890
 7891
 7892
 7893
 7894
 7895
 7896
 7897
 7898
 7899
 7900
 7901
 7902
 7903
 7904
 7905
 7906
 7907
 7908
 7909
 7910
 7911
 7912
 7913
 7914
 7915
 7916
 7917
 7918
 7919
 7920
 7921
 7922
 7923
 7924
 7925
 7926
 7927
 7928
 7929
 7930
 7931
 7932
 7933
 7934
 7935
 7936
 7937
 7938
 7939
 7940
 7941
 7942
 7943
 7944
 7945
 7946
 7947
 7948
 7949
 7950
 7951
 7952
 7953
 7954
 7955
 7956
 7957
 7958
 7959
 7960
 7961
 7962
 7963
 7964
 7965
 7966
 7967
 7968
 7969
 7970
 7971
 7972
 7973
 7974
 7975
 7976
 7977
 7978
 7979
 7980
 7981
 7982
 7983
 7984
 7985
 7986
 7987
 7988
 7989
 7990
 7991
 7992
 7993
 7994
 7995
 7996
 7997
 7998
 7999
 8000
 8001
 8002
 8003
 8004
 8005
 8006
 8007
 8008
 8009
 8010
 8011
 8012
 8013
 8014
 8015
 8016
 8017
 8018
 8019
 8020
 8021
 8022
 8023
 8024
 8025
 8026
 8027
 8028
 8029
 8030
 8031
 8032
 8033
 8034
 8035
 8036
 8037
 8038
 8039
 8040
 8041
 8042
 8043
 8044
 8045
 8046
 8047
 8048
 8049
 8050
 8051
 8052
 8053
 8054
 8055
 8056
 8057
 8058
 8059
 8060
 8061
 8062
 8063
 8064
 8065
 8066
 8067
 8068
 8069
 8070
 8071
 8072
 8073
 8074
 8075
 8076
 8077
 8078
 8079
 8080
 8081
 8082
 8083
 8084
 8085
 8086
 8087
 8088
 8089
 8090
 8091
 8092
 8093
 8094
 8095
 8096
 8097
 8098
 8099
 8100
 8101
 8102
 8103
 8104
 8105
 8106
 8107
 8108
 8109
 8110
 8111
 8112
 8113
 8114
 8115
 8116
 8117
 8118
 8119
 8120
 8121
 8122
 8123
 8124
 8125
 8126
 8127
 8128
 8129
 8130
 8131
 8132
 8133
 8134
 8135
 8136
 8137
 8138
 8139
 8140
 8141
 8142
 8143
 8144
 8145
 8146
 8147
 8148
 8149
 8150
 8151
 8152
 8153
 8154
 8155
 8156
 8157
 8158
 8159
 8160
 8161
 8162
 8163
 8164
 8165
 8166
 8167
 8168
 8169
 8170
 8171
 8172
 8173
 8174
 8175
 8176
 8177
 8178
 8179
 8180
 8181
 8182
 8183
 8184
 8185
 8186
 8187
 8188
 8189
 8190
 8191
 8192
 8193
 8194
 8195
 8196
 8197
 8198
 8199
 8200
 8201
 8202
 8203
 8204
 8205
 8206
 8207
 8208
 8209
 8210
 8211
 8212
 8213
 8214
 8215
 8216
 8217
 8218
 8219
 8220
 8221
 8222
 8223
 8224
 8225
 8226
 8227
 8228
 8229
 8230
 8231
 8232
 8233
 8234
 8235
 8236
 8237
 8238
 8239
 8240
 8241
 8242
 8243
 8244
 8245
 8246
 8247
 8248
 8249
 8250
 8251
 8252
 8253
 8254
 8255
 8256
 8257
 8258
 8259
 8260
 8261
 8262
 8263
 8264
 8265
 8266
 8267
 8268
 8269
 8270
 8271
 8272
 8273
 8274
 8275
 8276
 8277
 8278
 8279
 8280
 8281
 8282
 8283
 8284
 8285
 8286
 8287
 8288
 8289
 8290
 8291
 8292
 8293
 8294
 8295
 8296
 8297
 8298
 8299
 8300
 8301
 8302
 8303
 8304
 8305
 8306
 8307
 8308
 8309
 8310
 8311
 8312
 8313
 8314
 8315
 8316
 8317
 8318
 8319
 8320
 8321
 8322
 8323
 8324
 8325
 8326
 8327
 8328
 8329
 8330
 8331
 8332
 8333
 8334
 8335
 8336
 8337
 8338
 8339
 8340
 8341
 8342
 8343
 8344
 8345
 8346
 8347
 8348
 8349
 8350
 8351
 8352
 8353
 8354
 8355
 8356
 8357
 8358
 8359
 8360
 8361
 8362
 8363
 8364
 8365
 8366
 8367
 8368
 8369
 8370
 8371
 8372
 8373
 8374
 8375
 8376
 8377
 8378
 8379
 8380
 8381
 8382
 8383
 8384
 8385
 8386
 8387
 8388
 8389
 8390
 8391
 8392
 8393
 8394
 8395
 8396
 8397
 8398
 8399
 8400
 8401
 8402
 8403
 8404
 8405
 8406
 8407
 8408
 8409
 8410
 8411
 8412
 8413
 8414
 8415
 8416
 8417
 8418
 8419
 8420
 8421
 8422
 8423
 8424
 8425
 8426
 8427
 8428
 8429
 8430
 8431
 8432
 8433
 8434
 8435
 8436
 8437
 8438
 8439
 8440
 8441
 8442
 8443
 8444
 8445
 8446
 8447
 8448
 8449
 8450
 8451
 8452
 8453
 8454
 8455
 8456
 8457
 8458
 8459
 8460
 8461
 8462
 8463
 8464
 8465
 8466
 8467
 8468
 8469
 8470
 8471
 8472
 8473
 8474
 8475
 8476
 8477
 8478
 8479
 8480
 8481
 8482
 8483
 8484
 8485
 8486
 8487
 8488
 8489
 8490
 8491
 8492
 8493
 8494
 8495
 8496
 8497
 8498
 8499
 8500
 8501
 8502
 8503
 8504
 8505
 8506
 8507
 8508
 8509
 8510
 8511
 8512
 8513
 8514
 8515
 8516
 8517
 8518
 8519
 8520
 8521
 8522
 8523
 8524
 8525
 8526
 8527
 8528
 8529
 8530
 8531
 8532
 8533
 8534
 8535
 8536
 8537
 8538
 8539
 8540
 8541
 8542
 8543
 8544
 8545
 8546
 8547
 8548
 8549
 8550
 8551
 8552
 8553
 8554
 8555
 8556
 8557
 8558
 8559
 8560
 8561
 8562
 8563
 8564
 8565
 8566
 8567
 8568
 8569
 8570
 8571
 8572
 8573
 8574
 8575
 8576
 8577
 8578
 8579
 8580
 8581
 8582
 8583
 8584
 8585
 8586
 8587
 8588
 8589
 8590
 8591
 8592
 8593
 8594
 8595
 8596
 8597
 8598
 8599
 8600
 8601
 8602
 8603
 8604
 8605
 8606
 8607
 8608
 8609
 8610
 8611
 8612
 8613
 8614
 8615
 8616
 8617
 8618
 8619
 8620
 8621
 8622
 8623
 8624
 8625
 8626
 8627
 8628
 8629
 8630
 8631
 8632
 8633
 8634
 8635
 8636
 8637
 8638
 8639
 8640
 8641
 8642
 8643
 8644
 8645
 8646
 8647
 8648
 8649
 8650
 8651
 8652
 8653
 8654
 8655
 8656
 8657
 8658
 8659
 8660
 8661
 8662
 8663
 8664
 8665
 8666
 8667
 8668
 8669
 8670
 8671
 8672
 8673
 8674
 8675
 8676
 8677
 8678
 8679
 8680
 8681
 8682
 8683
 8684
 8685
 8686
 8687
 8688
 8689
 8690
 8691
 8692
 8693
 8694
 8695
 8696
 8697
 8698
 8699
 8700
 8701
 8702
 8703
 8704
 8705
 8706
 8707
 8708
 8709
 8710
 8711
 8712
 8713
 8714
 8715
 8716
 8717
 8718
 8719
 8720
 8721
 8722
 8723
 8724
 8725
 8726
 8727
 8728
 8729
 8730
 8731
 8732
 8733
 8734
 8735
 8736
 8737
 8738
 8739
 8740
 8741
 8742
 8743
 8744
 8745
 8746
 8747
 8748
 8749
 8750
 8751
 8752
 8753
 8754
 8755
 8756
 8757
 8758
 8759
 8760
 8761
 8762
 8763
 8764
 8765
 8766
 8767
 8768
 8769
 8770
 8771
 8772
 8773
 8774
 8775
 8776
 8777
 8778
 8779
 8780
 8781
 8782
 8783
 8784
 8785
 8786
 8787
 8788
 8789
 8790
 8791
 8792
 8793
 8794
 8795
 8796
 8797
 8798
 8799
 8800
 8801
 8802
 8803
 8804
 8805
 8806
 8807
 8808
 8809
 8810
 8811
 8812
 8813
 8814
 8815
 8816
 8817
 8818
 8819
 8820
 8821
 8822
 8823
 8824
 8825
 8826
 8827
 8828
 8829
 8830
 8831
 8832
 8833
 8834
 8835
 8836
 8837
 8838
 8839
 8840
 8841
 8842
 8843
 8844
 8845
 8846
 8847
 8848
 8849
 8850
 8851
 8852
 8853
 8854
 8855
 8856
 8857
 8858
 8859
 8860
 8861
 8862
 8863
 8864
 8865
 8866
 8867
 8868
 8869
 8870
 8871
 8872
 8873
 8874
 8875
 8876
 8877
 8878
 8879
 8880
 8881
 8882
 8883
 8884
 8885
 8886
 8887
 8888
 8889
 8890
 8891
 8892
 8893
 8894
 8895
 8896
 8897
 8898
 8899
 8900
 8901
 8902
 8903
 8904
 8905
 8906
 8907
 8908
 8909
 8910
 8911
 8912
 8913
 8914
 8915
 8916
 8917
 8918
 8919
 8920
 8921
 8922
 8923
 8924
 8925
 8926
 8927
 8928
 8929
 8930
 8931
 8932
 8933
 8934
 8935
 8936
 8937
 8938
 8939
 8940
 8941
 8942
 8943
 8944
 8945
 8946
 8947
 8948
 8949
 8950
 8951
 8952
 8953
 8954
 8955
 8956
 8957
 8958
 8959
 8960
 8961
 8962
 8963
 8964
 8965
 8966
 8967
 8968
 8969
 8970
 8971
 8972
 8973
 8974
 8975
 8976
 8977
 8978
 8979
 8980
 8981
 8982
 8983
 8984
 8985
 8986
 8987
 8988
 8989
 8990
 8991
 8992
 8993
 8994
 8995
 8996
 8997
 8998
 8999
 9000
 9001
 9002
 9003
 9004
 9005
 9006
 9007
 9008
 9009
 9010
 9011
 9012
 9013
 9014
 9015
 9016
 9017
 9018
 9019
 9020
 9021
 9022
 9023
 9024
 9025
 9026
 9027
 9028
 9029
 9030
 9031
 9032
 9033
 9034
 9035
 9036
 9037
 9038
 9039
 9040
 9041
 9042
 9043
 9044
 9045
 9046
 9047
 9048
 9049
 9050
 9051
 9052
 9053
 9054
 9055
 9056
 9057
 9058
 9059
 9060
 9061
 9062
 9063
 9064
 9065
 9066
 9067
 9068
 9069
 9070
 9071
 9072
 9073
 9074
 9075
 9076
 9077
 9078
 9079
 9080
 9081
 9082
 9083
 9084
 9085
 9086
 9087
 9088
 9089
 9090
 9091
 9092
 9093
 9094
 9095
 9096
 9097
 9098
 9099
 9100
 9101
 9102
 9103
 9104
 9105
 9106
 9107
 9108
 9109
 9110
 9111
 9112
 9113
 9114
 9115
 9116
 9117
 9118
 9119
 9120
 9121
 9122
 9123
 9124
 9125
 9126
 9127
 9128
 9129
 9130
 9131
 9132
 9133
 9134
 9135
 9136
 9137
 9138
 9139
 9140
 9141
 9142
 9143
 9144
 9145
 9146
 9147
 9148
 9149
 9150
 9151
 9152
 9153
 9154
 9155
 9156
 9157
 9158
 9159
 9160
 9161
 9162
 9163
 9164
 9165
 9166
 9167
 9168
 9169
 9170
 9171
 9172
 9173
 9174
 9175
 9176
 9177
 9178
 9179
 9180
 9181
 9182
 9183
 9184
 9185
 9186
 9187
 9188
 9189
 9190
 9191
 9192
 9193
 9194
 9195
 9196
 9197
 9198
 9199
 9200
 9201
 9202
 9203
 9204
 9205
 9206
 9207
 9208
 9209
 9210
 9211
 9212
 9213
 9214
 9215
 9216
 9217
 9218
 9219
 9220
 9221
 9222
 9223
 9224
 9225
 9226
 9227
 9228
 9229
 9230
 9231
 9232
 9233
 9234
 9235
 9236
 9237
 9238
 9239
 9240
 9241
 9242
 9243
 9244
 9245
 9246
 9247
 9248
 9249
 9250
 9251
 9252
 9253
 9254
 9255
 9256
 9257
 9258
 9259
 9260
 9261
 9262
 9263
 9264
 9265
 9266
 9267
 9268
 9269
 9270
 9271
 9272
 9273
 9274
 9275
 9276
 9277
 9278
 9279
 9280
 9281
 9282
 9283
 9284
 9285
 9286
 9287
 9288
 9289
 9290
 9291
 9292
 9293
 9294
 9295
 9296
 9297
 9298
 9299
 9300
 9301
 9302
 9303
 9304
 9305
 9306
 9307
 9308
 9309
 9310
 9311
 9312
 9313
 9314
 9315
 9316
 9317
 9318
 9319
 9320
 9321
 9322
 9323
 9324
 9325
 9326
 9327
 9328
 9329
 9330
 9331
 9332
 9333
 9334
 9335
 9336
 9337
 9338
 9339
 9340
 9341
 9342
 9343
 9344
 9345
 9346
 9347
 9348
 9349
 9350
 9351
 9352
 9353
 9354
 9355
 9356
 9357
 9358
 9359
 9360
 9361
 9362
 9363
 9364
 9365
 9366
 9367
 9368
 9369
 9370
 9371
 9372
 9373
 9374
 9375
 9376
 9377
 9378
 9379
 9380
 9381
 9382
 9383
 9384
 9385
 9386
 9387
 9388
 9389
 9390
 9391
 9392
 9393
 9394
 9395
 9396
 9397
 9398
 9399
 9400
 9401
 9402
 9403
 9404
 9405
 9406
 9407
 9408
 9409
 9410
 9411
 9412
 9413
 9414
 9415
 9416
 9417
 9418
 9419
 9420
 9421
 9422
 9423
 9424
 9425
 9426
 9427
 9428
 9429
 9430
 9431
 9432
 9433
 9434
 9435
 9436
 9437
 9438
 9439
 9440
 9441
 9442
 9443
 9444
 9445
 9446
 9447
 9448
 9449
 9450
 9451
 9452
 9453
 9454
 9455
 9456
 9457
 9458
 9459
 9460
 9461
 9462
 9463
 9464
 9465
 9466
 9467
 9468
 9469
 9470
 9471
 9472
 9473
 9474
 9475
 9476
 9477
 9478
 9479
 9480
 9481
 9482
 9483
 9484
 9485
 9486
 9487
 9488
 9489
 9490
 9491
 9492
 9493
 9494
 9495
 9496
 9497
 9498
 9499
 9500
 9501
 9502
 9503
 9504
 9505
 9506
 9507
 9508
 9509
 9510
 9511
 9512
 9513
 9514
 9515
 9516
 9517
 9518
 9519
 9520
 9521
 9522
 9523
 9524
 9525
 9526
 9527
 9528
 9529
 9530
 9531
 9532
 9533
 9534
 9535
 9536
 9537
 9538
 9539
 9540
 9541
 9542
 9543
 9544
 9545
 9546
 9547
 9548
 9549
 9550
 9551
 9552
 9553
 9554
 9555
 9556
 9557
 9558
 9559
 9560
 9561
 9562
 9563
 9564
 9565
 9566
 9567
 9568
 9569
 9570
 9571
 9572
 9573
 9574
 9575
 9576
 9577
 9578
 9579
 9580
 9581
 9582
 9583
 9584
 9585
 9586
 9587
 9588
 9589
 9590
 9591
 9592
 9593
 9594
 9595
 9596
 9597
 9598
 9599
 9600
 9601
 9602
 9603
 9604
 9605
 9606
 9607
 9608
 9609
 9610
 9611
 9612
 9613
 9614
 9615
 9616
 9617
 9618
 9619
 9620
 9621
 9622
 9623
 9624
 9625
 9626
 9627
 9628
 9629
 9630
 9631
 9632
 9633
 9634
 9635
 9636
 9637
 9638
 9639
 9640
 9641
 9642
 9643
 9644
 9645
 9646
 9647
 9648
 9649
 9650
 9651
 9652
 9653
 9654
 9655
 9656
 9657
 9658
 9659
 9660
 9661
 9662
 9663
 9664
 9665
 9666
 9667
 9668
 9669
 9670
 9671
 9672
 9673
 9674
 9675
 9676
 9677
 9678
 9679
 9680
 9681
 9682
 9683
 9684
 9685
 9686
 9687
 9688
 9689
 9690
 9691
 9692
 9693
 9694
 9695
 9696
 9697
 9698
 9699
 9700
 9701
 9702
 9703
 9704
 9705
 9706
 9707
 9708
 9709
 9710
 9711
 9712
 9713
 9714
 9715
 9716
 9717
 9718
 9719
 9720
 9721
 9722
 9723
 9724
 9725
 9726
 9727
 9728
 9729
 9730
 9731
 9732
 9733
 9734
 9735
 9736
 9737
 9738
 9739
 9740
 9741
 9742
 9743
 9744
 9745
 9746
 9747
 9748
 9749
 9750
 9751
 9752
 9753
 9754
 9755
 9756
 9757
 9758
 9759
 9760
 9761
 9762
 9763
 9764
 9765
 9766
 9767
 9768
 9769
 9770
 9771
 9772
 9773
 9774
 9775
 9776
 9777
 9778
 9779
 9780
 9781
 9782
 9783
 9784
 9785
 9786
 9787
 9788
 9789
 9790
 9791
 9792
 9793
 9794
 9795
 9796
 9797
 9798
 9799
 9800
 9801
 9802
 9803
 9804
 9805
 9806
 9807
 9808
 9809
 9810
 9811
 9812
 9813
 9814
 9815
 9816
 9817
 9818
 9819
 9820
 9821
 9822
 9823
 9824
 9825
 9826
 9827
 9828
 9829
 9830
 9831
 9832
 9833
 9834
 9835
 9836
 9837
 9838
 9839
 9840
 9841
 9842
 9843
 9844
 9845
 9846
 9847
 9848
 9849
 9850
 9851
 9852
 9853
 9854
 9855
 9856
 9857
 9858
 9859
 9860
 9861
 9862
 9863
 9864
 9865
 9866
 9867
 9868
 9869
 9870
 9871
 9872
 9873
 9874
 9875
 9876
 9877
 9878
 9879
 9880
 9881
 9882
 9883
 9884
 9885
 9886
 9887
 9888
 9889
 9890
 9891
 9892
 9893
 9894
 9895
 9896
 9897
 9898
 9899
 9900
 9901
 9902
 9903
 9904
 9905
 9906
 9907
 9908
 9909
 9910
 9911
 9912
 9913
 9914
 9915
 9916
 9917
 9918
 9919
 9920
 9921
 9922
 9923
 9924
 9925
 9926
 9927
 9928
 9929
 9930
 9931
 9932
 9933
 9934
 9935
 9936
 9937
 9938
 9939
 9940
 9941
 9942
 9943
 9944
 9945
 9946
 9947
 9948
 9949
 9950
 9951
 9952
 9953
 9954
 9955
 9956
 9957
 9958
 9959
 9960
 9961
 9962
 9963
 9964
 9965
 9966
 9967
 9968
 9969
 9970
 9971
 9972
 9973
 9974
 9975
 9976
 9977
 9978
 9979
 9980
 9981
 9982
 9983
 9984
 9985
 9986
 9987
 9988
 9989
 9990
 9991
 9992
 9993
 9994
 9995
 9996
 9997
 9998
 9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>Sequence assembly with MIRA 4</title><link rel="stylesheet" type="text/css" href="doccss/miradocstyle.css"><meta name="generator" content="DocBook XSL Stylesheets V1.79.1"></head><body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF"><div class="book"><div class="titlepage"><div><div><h1 class="title"><a name="idp32051920"></a>Sequence assembly with MIRA 4</h1></div><div><h2 class="subtitle">
  The Definitive Guide
</h2></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><span class="contrib">Main author</span> <code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><div class="othercredit"><h3 class="othercredit"><span class="firstname">Jacqueline</span> <span class="surname">Weber</span></h3><span class="contrib">Extensive review of early reference manual
  </span> </div></div><div><div class="othercredit"><h3 class="othercredit"><span class="firstname">Andrea</span> <span class="surname">Hörster</span></h3><span class="contrib">Extensive review of early reference manual
  </span> </div></div><div><div class="othercredit"><h3 class="othercredit"><span class="firstname">Katrina</span> <span class="surname">Dlugosch</span></h3><span class="contrib">Draft for section on preprocessing of ESTs in EST manual
  </span> </div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div><div><div class="legalnotice"><a name="idp32056384"></a><p>
This documentation is licensed under the Creative Commons
Attribution-NonCommercial-ShareAlike 3.0 Unported License. To view a copy of
this license, visit <a class="ulink" href="http://creativecommons.org/licenses/by-nc-sa/3.0/" target="_top">http://creativecommons.org/licenses/by-nc-sa/3.0/</a> or send a letter to
Creative Commons, 171 Second Street, Suite 300, San Francisco, California,
94105, USA.
</p></div></div></div><hr></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="preface"><a href="#idp37973136">Preface</a></span></dt><dt><span class="chapter"><a href="#chap_intro">1. Introduction to MIRA</a></span></dt><dt><span class="chapter"><a href="#chap_installation">2. Installing MIRA</a></span></dt><dt><span class="chapter"><a href="#chap_reference">3. MIRA 4 reference manual</a></span></dt><dt><span class="chapter"><a href="#chap_dataprep">4. Preparing data</a></span></dt><dt><span class="chapter"><a href="#chap_denovo">5. De-novo assemblies</a></span></dt><dt><span class="chapter"><a href="#chap_mapping">6. Mapping assemblies</a></span></dt><dt><span class="chapter"><a href="#chap_est">7. EST / RNASeq assemblies</a></span></dt><dt><span class="chapter"><a href="#chap_specialparams">8. Parameters for special situations</a></span></dt><dt><span class="chapter"><a href="#chap_results">9. Working with the results of MIRA</a></span></dt><dt><span class="chapter"><a href="#chap_mutils">10. Utilities in the MIRA package</a></span></dt><dt><span class="chapter"><a href="#chap_hard">11. Assembly of <span class="emphasis"><em>hard</em></span> genome or EST / RNASeq projects</a></span></dt><dt><span class="chapter"><a href="#chap_seqtechdesc">12. Description of sequencing technologies</a></span></dt><dt><span class="chapter"><a href="#chap_seqadvice">13. Some advice when going into a sequencing project</a></span></dt><dt><span class="chapter"><a href="#chap_bitsandpieces">14. Bits and pieces</a></span></dt><dt><span class="chapter"><a href="#chap_faq">15. Frequently asked questions</a></span></dt><dt><span class="chapter"><a href="#chap_maf">16. The MAF format</a></span></dt><dt><span class="chapter"><a href="#chap_logfiles">17. Log and temporary files used by MIRA</a></span></dt></dl></div><div class="list-of-figures"><p><b>List of Figures</b></p><dl><dt>1.1. <a href="#chap_intro::srmc_in_454sxahyb_1stpass.png">
	  How MIRA learns from misassemblies (1)
	</a></dt><dt>1.2. <a href="#chap_intro::srmc_in_454sxahyb_lastpass1.png">
	  How MIRA learns from misassemblies (2)
	</a></dt><dt>1.3. <a href="#chap_intro::srmc_in_454sxahyb_lastpass2.png">
	  How MIRA learns from misassemblies (3)
	</a></dt><dt>1.4. <a href="#chap_intro::gcb99_replocator.png">
	  Slides presenting the repeat locator at the GCB 99
	</a></dt><dt>1.5. <a href="#chap_intro::gcb99_edit.png">
	  Slides presenting the EdIt automatic Sanger editor at the GCB 99
	</a></dt><dt>1.6. <a href="#chap_intro::san_autoedit1.png">
	  Sanger assembly without EdIt automatic editing routines
	</a></dt><dt>1.7. <a href="#chap_intro::san_autoedit2.png">
	  Sanger assembly with EdIt automatic editing routines
	</a></dt><dt>1.8. <a href="#chap_intro::454_autoedit1.png">
	  454 assembly without 454 automatic editing routines
	</a></dt><dt>1.9. <a href="#chap_intro::454_autoedit2.png">
	  454 assembly with 454 automatic editing routines
	</a></dt><dt>1.10. <a href="#chap_intro::haf5_haf2_contigcoverage_ovals.png">
	  Coverage of a contig.
	</a></dt><dt>1.11. <a href="#chap_intro::haf5_repend_rrna.png">
	  Repetitive end of a contig
	</a></dt><dt>1.12. <a href="#chap_intro::haf2_end_nomoredata.png">
	  Non-repetitive end of a contig
	</a></dt><dt>1.13. <a href="#chap_intro::454sxa_stms_hybdenovo.png">
	  MIRA pointing out problems in hybrid assemblies (1)
	</a></dt><dt>1.14. <a href="#chap_intro::454san_stmu_hybdenovo.png">
	  MIRA pointing out problems in hybrid assemblies (2)
	</a></dt><dt>1.15. <a href="#chap_intro::sxa_cer_reads1.png">
	  Coverage equivalent reads (CERs) explained.
	</a></dt><dt>1.16. <a href="#chap_intro::sxa_cer_reads2.png">
	  Coverage equivalent reads let SNPs become very visible in assembly viewers
	</a></dt><dt>1.17. <a href="#chap_intro::sxa_sroc_lenski2.png">
	  SNP tags in a MIRA assembly
	</a></dt><dt>1.18. <a href="#chap_intro::sxa_mcvc_lenski.png">
	  Tag pointing out a large deletion in a MIRA mapping assembly
	</a></dt><dt>9.1. <a href="#chap_res::results_miraconvert.png">
	  Format conversions with <span class="command"><strong>miraconvert</strong></span>
	</a></dt><dt>9.2. <a href="#chap_res::results_mira2other.png">
	  Conversions needed for other tools.
	</a></dt><dt>9.3. <a href="#haf_danger_join_notok.png">
	      Join at a repetitive site which should not be performed due to
	      missing spanning templates.
	    </a></dt><dt>9.4. <a href="#haf_danger_join_ok.png">
	      Join at a repetitive site which should be performed due to
	      spanning templates being good.
	    </a></dt><dt>9.5. <a href="#454_stacks_join.png">
	      Pseudo-repeat in 454 data due to sequencing artifacts
	    </a></dt><dt>9.6. <a href="#chap_sol::sxa_sroc_lenski1.png">
	      "SROc" tag showing a SNP position in a Solexa mapping
	      assembly.
	    </a></dt><dt>9.7. <a href="#chap_sol::sxa_sroc_lenski2.png">
	      "SROc" tag showing a SNP/indel position in a Solexa mapping
	      assembly.
	    </a></dt><dt>9.8. <a href="#chap_sol::sxa_mcvc_lenski.png">
	      "MCVc" tag (dark red stretch in figure) showing a genome
	      deletion in Solexa mapping assembly.
	    </a></dt><dt>9.9. <a href="#chap_sol::sxa_wrmcsrmc_hiding_lenski1.png">
	      An IS150 insertion hiding behind a WRMc and a SRMc tag
	    </a></dt><dt>9.10. <a href="#chap_sol::sxa_xmastree_lenski1.png">
	      A 16 base pair deletion leading to a SROc/UNsC xmas-tree
	    </a></dt><dt>9.11. <a href="#chap_sol::sxa_xmastree_lenski2.png">
	      An IS186 insertion leading to a SROc/UNsC xmas-tree
	    </a></dt><dt>12.1. <a href="#sxa_unsc_ggcxg2_lenski.png">
	    The Solexa GGCxG problem.
	  </a></dt><dt>12.2. <a href="#sxa_unsc_ggc1_lenski.png">
	    The Solexa GGC problem, forward example
	  </a></dt><dt>12.3. <a href="#sxa_unsc_ggc4_lenski.png">
	    The Solexa GGC problem, reverse example
	  </a></dt><dt>12.4. <a href="#sxa_xmastree_lenski2.png">
	    A genuine place of interest almost masked by the
	    <code class="literal">GGCxG</code> problem.
	  </a></dt><dt>12.5. <a href="#sxa_gcbias_nobias2008.png">
	    Example for no GC coverage bias in 2008 Solexa data.
	  </a></dt><dt>12.6. <a href="#sxa_gcbias_bias2009.png">
	    Example for GC coverage bias starting Q3 2009 in Solexa data.
	  </a></dt><dt>12.7. <a href="#sxa_gcbias_comp20082009.png">
	    Example for GC coverage bias, direct comparison 2008 / 2010 data.
	  </a></dt><dt>12.8. <a href="#chap_iontor::ion_dh10bgoodB13.png">
	Example for good IonTorrent data (100bp reads)
      </a></dt><dt>12.9. <a href="#chap_iontor::iontor_indelhpexample.png">
	  Example for problematic IonTorrent data (100bp reads)
	</a></dt><dt>12.10. <a href="#chap_iontor::ion_dh10bdirdepindel.png.png">
	  Example for a sequencing direction dependent indel
	</a></dt></dl></div><div class="preface"><div class="titlepage"><div><div><h1 class="title"><a name="idp37973136"></a>Preface</h1></div></div></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
    <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">How much intelligence does one need to sneak upon lettuce?
    </span>&#8221;</span></em></span>
  </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><p>
  This "book" is actually the result of an exercise in self-defense. It
  contains texts from several years of help files, mails, postings, questions,
  answers, etc. concerning MIRA and assembly projects one can do with it.
</p><p>
  I never really intended to push MIRA. It started out as a PhD thesis and I
  subsequently continued development when I needed something to be done which
  other programs couldn't do at the time. But MIRA has always been available
  as binary on the Internet since 1999 ... and as Open Source since
  2007. Somehow, MIRA seems to have caught the attention of more than just a
  few specialised sequencing labs and over the years I've seen an ever growing
  number of mails in my inbox and on the MIRA mailing list. Both from people
  having been "since ever" in the sequencing business as well as from labs or
  people just getting their feet wet in the area.
</p><p>
  The help files -- and through them this book -- sort of reflect this
  development. Most of the chapters<a href="#ftn.idp39850480" class="footnote" name="idp39850480"><sup class="footnote">[1]</sup></a> contain both very specialised
  topics as well as step-by-step walk-throughs intended to help people to get
  their assembly projects going. Some parts of the documentation are written
  in a decidedly non-scientific way. Please excuse this: as time for rewriting
  mails was somewhat lacking, some texts were re-used almost verbatim.
</p><p>
  The last few years have seen tremendous change in the sequencing
  technologies and MIRA 4 reflects that: core data structures and
  routines had to be thrown overboard and replaced with faster and/or more
  versatile versions suited for the broad range of technologies and use-cases
  I am currently running MIRA with.
</p><p>
  Nothing is perfect, and both MIRA and this documentation (even if it is
  rather pompously called <span class="emphasis"><em>Definitive Guide</em></span>) are far from
  it. If you spot an error either in MIRA or this manual, feel free to report
  it. Or, even better, correct it if you can. At least with the manual files
  it should be easy: they're basically just some decorated text files.
</p><p>
  I hope that MIRA will be as useful to you as it has been to me. Have a lot
  of fun with it.
</p><p>
  Rheinfelden, February 2014
</p><p>
  Bastien Chevreux
</p><div class="footnotes"><br><hr style="width:100; text-align:left;margin-left: 0"><div id="ftn.idp39850480" class="footnote"><p><a href="#idp39850480" class="para"><sup class="para">[1] </sup></a>Avid readers of David
  Gerrold will certainly recognise the quotes from his books at the beginning
  of each chapter</p></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_intro"></a>Chapter 1. Introduction to MIRA</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_intro_whatismira">1.1. 
      What is MIRA?
    </a></span></dt><dt><span class="sect1"><a href="#sect_wheretostartreading">1.2. 
      What to read in this manual and where to start reading?
    </a></span></dt><dt><span class="sect1"><a href="#sect_intro_miraquicktour">1.3. 
      The MIRA quick tour
    </a></span></dt><dt><span class="sect1"><a href="#sect_for_which_data_sets_to_use_mira_and_for_which_not">1.4. 
      For which data sets to use MIRA and for which not
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect3_genome_denovo">1.4.1. 
	Genome de-novo
      </a></span></dt><dt><span class="sect2"><a href="#sect_genome_mapping">1.4.2. 
	Genome mapping
      </a></span></dt><dt><span class="sect2"><a href="#sect3_ests_rnaseq">1.4.3. 
	ESTs / RNASeq
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_intro_specialfeatures">1.5. 
      Any special features I might be interested in?
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_intro_miradiscernsrepeats">1.5.1. 
	MIRA learns to discern non-perfect repeats, leading to better assemblies
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_automatic_editors">1.5.2. 
	MIRA has integrated editors for data from Sanger, 454, IonTorrent sequencing
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_whycontigsend">1.5.3. 
	MIRA lets you see why contigs end where they end
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_stmshybrid_tags">1.5.4. 
	MIRA tags problematic decisions in hybrid assemblies
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_cer_reads">1.5.5. 
	MIRA allows older finishing programs to cope with the amount of data in Solexa
	mapping projects
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_mapping_tags">1.5.6. 
	MIRA tags SNPs and other features, outputs result files
	for biologists
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_miramuchmore">1.5.7. 
	MIRA has ... much more
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_intro_versions_licenses_disclaimer_and_copyright">1.6. 
      Versions, Licenses, Disclaimer and Copyright
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_intro_versions">1.6.1. 
	Versions
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_licenses">1.6.2. 
	License
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_intro_licensemira">1.6.2.1. 
	  MIRA
	</a></span></dt><dt><span class="sect3"><a href="#sect_intro_licensedocs">1.6.2.2. 
	  Documentation
	</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_intro_copyright">1.6.3. 
	Copyright
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_external_libraries">1.6.4. 
	External libraries
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_intro_getting_help___mailing_lists___reporting_bugs">1.7. 
      Getting help / Mailing lists / Reporting bugs
    </a></span></dt><dt><span class="sect1"><a href="#sect_intro_author">1.8. 
      Author
    </a></span></dt><dt><span class="sect1"><a href="#sect_intro_miscellaneous">1.9. 
      Miscellaneous
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_intro_citations">1.9.1. 
	Citing MIRA
      </a></span></dt><dt><span class="sect2"><a href="#sect_intro_postcards_gold_and_jewellery">1.9.2. 
	Postcards, gold and jewellery
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">Half of being smart is to know what you're dumb at.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_intro_whatismira"></a>1.1. 
      What is MIRA?
    </h2></div></div></div><p>
      MIRA is a multi-pass DNA sequence data assembler/mapper for whole
      genome and EST/RNASeq projects. MIRA assembles/maps reads gained by
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	  electrophoresis sequencing (aka Sanger sequencing)
	</p></li><li class="listitem"><p>
	  454 pyro-sequencing (GS20, FLX or Titanium)
	</p></li><li class="listitem"><p>
	  Ion Torrent
	</p></li><li class="listitem"><p>
	  Solexa (Illumina) sequencing
	</p></li><li class="listitem"><p>
	  (in development) Pacific Biosciences sequencing
	</p></li></ul></div><p>
      into contiguous sequences (called <span class="emphasis"><em>contigs</em></span>). One can
      use the sequences of different sequencing technologies either in a
      single assembly run (a <span class="emphasis"><em>true hybrid assembly</em></span>) or by
      mapping one type of data to an assembly of another sequencing type (a
      <span class="emphasis"><em>semi-hybrid assembly (or mapping)</em></span>) or by mapping
      data against consensus sequences of other assemblies (a <span class="emphasis"><em>simple
      mapping</em></span>).
    </p><p>
      The MIRA acronym stands for <span class="bold"><strong>M</strong></span>imicking
      <span class="bold"><strong>I</strong></span>ntelligent <span class="bold"><strong>R</strong></span>ead <span class="bold"><strong>A</strong></span>ssembly
      and the program pretty well does what its acronym says (well, most of
      the time anyway). It is the Swiss army knife of sequence assembly that
      I've used and developed during the past 14 years to get assembly jobs I
      work on done efficiently - and especially accurately. That is, without
      me actually putting too much manual work into it.
    </p><p>
      Over time, other labs and sequencing providers have found MIRA useful
      for assembly of extremely 'unfriendly' projects containing lots of
      repetitive sequences. As always, your mileage may vary.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_wheretostartreading"></a>1.2. 
      What to read in this manual and where to start reading?
    </h2></div></div></div><p>
      At the last count, this manual had almost 200 pages and this might seem a little bit daunting.
      However, you very probably do not need to read everything.
    </p><p>
      You should read most of this introductory chapter though: e.g.,
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    the part with the MIRA quick tour
	  </p></li><li class="listitem"><p>
	    the part which gives a quick overview for which data sets to use MIRA and for which not
	  </p></li><li class="listitem"><p>
	    the part which showcases different features of MIRA (lots of screen shots!)
	  </p></li><li class="listitem"><p>
	    where and how to get help if things don't work out as you expected
	  </p></li></ul></div><p>
      After that, reading should depend on the type of data you intend to work
      with: there are specific chapters for assembly of de-novo, of mapping and
      of EST / RNASeq projects. They all contain an overview on how to
      define your data and how to launch MIRA for these data sets. There is
      also a chapter on how to prepare data sets from specific sequencing
      technologies.
    </p><p>
      The chapter on working with results of MIRA should again be of general
      interest to everyone. It describes the structure of output directories
      and files and gives first pointers on what to find where. Also,
      converting results into different formats -- with and without filtering
      for specific needs -- is covered there.
    </p><p>
      As the previously cited chapters are more introductory in their nature,
      they do not go into the details of MIRA parametrisation. While MIRA has
      a comprehensive set of standard settings which should be suited for a
      majority of assembly tasks, there are more than 150 switches / parameters
      with which one can fine tune almost every aspect of an assembly. A
      complete description for each and every parameter and how to correctly
      set parameters for different use cases and sequencing technologies can
      be found in the reference chapter.
    </p><p>
      As not every assembly project is simple, there is also a chapter with
      tips on how to deal with projects which turn out to be "hard." It
      certainly helps if you at least skim through it even if you do not
      expect to have problems with your data ... it contains a couple of
      tricks on what one can see in result files as well as in temporary and
      log files which are not explained elsewhere.
    </p><p>
      MIRA comes with a number of additional utilities which are described in
      their own chapter. While the purpose of <span class="command"><strong>miraconvert</strong></span>
      should be quite clear quite quickly, the versatility of use cases for
      <span class="command"><strong>mirabait</strong></span> might surprise more than one. Be sure to
      check it out.
    </p><p>
      As from time to time some general questions on sequencing are popping up
      on the MIRA talk mailing list, I have added a chapter with some general
      musings on what to consider when going into sequencing projects. This
      should be in no way a replacement for an exhaustive talk with a
      sequencing provider, but it can give a couple of hints on what to take
      care of.
    </p><p>
      There is also a FAQ chapter with some of the more frequently asked questions
      which popped up in the past few years.
    </p><p>
      Finally, there are also chapters covering some more technical aspects of MIRA: the MAF format
      and structure / content of the tmp directory have their own chapters.
    </p><p>
      Complete walkthroughs ... are lacking at the moment for MIRA 4. In the
      MIRA 3 manual I had them, but so many things have changed (at all
      levels: MIRA, the sequencing technologies, data repositories) that I did
      not have time to update them. I probably will need quite some time to
      write new ones. Feel free to send me some if you are inclined to help
      fellow scientists.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_intro_miraquicktour"></a>1.3. 
      The MIRA quick tour
    </h2></div></div></div><p>
      Input can be in various formats like Staden experiment (EXP), Sanger
      CAF, FASTA, FASTQ or PHD file. Ancillary data containing additional
      information helpful to the assembly as is contained in, e.g. NCBI
      traceinfo XML files or Staden EXP files, is also honoured. If present,
      base qualities in
      <span class="command"><strong>phred</strong></span> style and SCF signal electrophoresis trace
      files are used to adjudicate between or even correct contradictory
      stretches of bases in reads by either the integrated automatic EdIt
      editor (written by Thomas Pfisterer) or the assembler itself.
    </p><p>
      MIRA was conceived especially with the problem of repeats in genomic
      data and SNPs in transcript (EST / RNASeq) data in mind. Considerable
      effort was made to develop a number of strategies -- ranging from
      standard clone-pair size restrictions to discovery and marking of base
      positions discriminating the different repeats / SNPs -- to ensure that
      repetitive elements are correctly resolved and that misassemblies do not
      occur.
    </p><p>
      The resulting assembly can be written in different standard formats like
      CAF, Staden GAP4 directed assembly, ACE, HTML, FASTA, simple text or
      transposed contig summary (TCS) files. These can easily be imported into
      numerous finishing tools or further evaluated with simple scripts.
    </p><p>
      The aim of MIRA is to build the best possible assembly by
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  having a more or less full overview on the whole project at any time
	  of the assembly, i.e. knowledge of almost all possible read-pairs in
	  a project,
	</p></li><li class="listitem"><p>
	  using high confidence regions (HCRs) of several aligned read-pairs to
	  start contig building at a good anchor point of a contig, extending
	  clipped regions of reads on a 'can be justified' basis.
	</p></li><li class="listitem"><p>
	  using all available data present at the time of assembly, i.e.,
	  instead of relying on sequence and base confidence values only, the
	  assembler will profit from trace files containing electrophoresis
	  signals, tags marking possible special attributes of DNA,
	  information on specific insert sizes of read-pairs etc.
	</p></li><li class="listitem"><p>
	  having 'intelligent' contig objects accept or refuse reads based on
	  the rate of unexplainable errors introduced into the consensus
	</p></li><li class="listitem"><p>
	  learning from mistakes by discovering and analysing possible repeats
	  differentiated only by single nucleotide polymorphisms. The
	  important bases for discriminating different repetitive elements are
	  tagged and used as new information.
	</p></li><li class="listitem"><p>
	  using the possibility given by the integrated automatic editor to
	  correct errors present in contigs (and subsequently reads) by
	  generating and verifying complex error hypotheses through analysis
	  of trace signals in several reads covering the same area of a
	  consensus,
	</p></li><li class="listitem"><p>
	  iteratively extending reads (and subsequently contigs) based on
	</p><div class="orderedlist"><ol class="orderedlist" type="a"><li class="listitem"><p>
	      additional information gained by overlapping read pairs in contigs
	      and
	    </p></li><li class="listitem"><p>
	      corrections made by the automated editor.
	    </p></li></ol></div></li></ol></div><p>
    </p><p>
      MIRA was part of a bigger project that started at the DKFZ (Deutsches
      Krebsforschungszentrum, German Cancer Research Centre) Heidelberg in
      1997: the "Bundesministerium für Bildung, Wissenschaft, Forschung und
      Technologie" supported the PhD thesis of Thomas and myself by grant
      number <span class="emphasis"><em>01 KW 9611</em></span>. Beside an assembler to tackle
      difficult repeats, the grant also supported the automated editor /
      finisher EdIt package -- written by Thomas Pfisterer. The strength of
      MIRA and EdIt is the automatic interaction of both packages which
      produces assemblies with less work for human finishers to be done.
    </p><p>
      I'd like to thank everybody who reported bugs to me, pointed out problems
      they encountered while using the predecessors, and sent ideas and suggestions.
      Please continue to do so; the feedback made this third version possible.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_for_which_data_sets_to_use_mira_and_for_which_not"></a>1.4. 
      For which data sets to use MIRA and for which not
    </h2></div></div></div><p>
      As a general rule of thumb: if you have an organism with more than
      100 to 150 megabases or more than 20 to 40 million reads, you might want
      to try other assemblers first.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect3_genome_denovo"></a>1.4.1. 
	Genome de-novo
      </h3></div></div></div><p>
	For genome assembly, the version 3 series of MIRA has been reported
	to work on projects with something like a million Sanger reads (~80 to
	100 megabases at 10x coverage), five to ten million 454 Titanium reads
	(~100 megabases at 20x coverage) and 20 to 40 million Solexa reads
	(enough for de-novo of a bacterium or a small eukaryote with 76mers or
	100mers).
      </p><p>
	Provided you have the memory, MIRA is expected to work in de-novo
	mode with
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    Sanger reads: 5 to 10 million
	  </p></li><li class="listitem"><p>
	    454 reads: 5 to 15 million
	  </p></li><li class="listitem"><p>
	    Ion Torrent reads: 5 to 15 million
	  </p></li><li class="listitem"><p>
	    Solexa reads: 15 to 20 million
	  </p></li></ul></div><p>
	and "normal" coverages, where "normal" would be at no more than 50x
	to 70x for genome projects. Higher coverages will also work, but may
	create somewhat larger temporary files without heavy
	parametrisation. Lower coverages (&lt;4x for Sanger, &lt;10x for 454,
	&lt; 10x for IonTorrent) also need special attention in the
	parameter settings.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_genome_mapping"></a>1.4.2. 
	Genome mapping
      </h3></div></div></div><p>
	As the complexity of mapping is a lot lower than de-novo, one can
	basically double (perhaps even triple) the number of reads compared to
	'de-novo'. The limiting factor will be the amount of RAM though, and
	MIRA will also need lots of it if you go into eukaryotes.
      </p><p>
	The main limiting factor regarding time will be the number of
	reference sequences (backbones) you are using. MIRA being pedantic
	during the mapping process, it might be a rather long wait if you have
	more than 500 to 1000 reference sequences.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect3_ests_rnaseq"></a>1.4.3. 
	ESTs / RNASeq
      </h3></div></div></div><p>
	The default values for MIRA should allow it to work with many EST and
	RNASeq data sets, sometimes even from non-normalised libraries. For
	extreme coverage cases however (like, something with a lot of cases at
	and above 10k coverage), one would perhaps want to resort to data
	reduction routines before feeding the sequences to MIRA.
      </p><p>
	On the other hand, recent developments of MIRA were targeted at making
	de-novo RNASeq assembly of non-normalised libraries liveable, and
	indeed I now regularly use MIRA for data sets with up to 50 million
	Illumina 100bp reads.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_intro_specialfeatures"></a>1.5. 
      Any special features I might be interested in?
    </h2></div></div></div><p>
      A few perhaps.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	The screen shots in this section show data from assemblies produced
	with MIRA, but the visualisation itself is done in a finishing program
	named <span class="command"><strong>gap4</strong></span>.
      </p><p>
	Some of the screen shots were edited for showing a special feature of
	MIRA. E.g., in the screen shots with Solexa data, quite some reads were
	left out of the view pane as otherwise -- due to the amount of data --
	these screen shots would need several pages for a complete printout.
      </p></td></tr></table></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_miradiscernsrepeats"></a>1.5.1. 
	MIRA learns to discern non-perfect repeats, leading to better assemblies
      </h3></div></div></div><p>
	MIRA is an iterative assembler (it works in several passes) and acts a
	bit like a child when exploring the world: it explores the assembly
	space and is specifically parameterised to allow a couple of assembly
	errors during the first passes. But after each pass some routines (the
	"parents", if you like) check the result, searching for assembly
	errors and deduce knowledge about specific assemblies MIRA should not
	have ventured into. MIRA will then prevent these errors from re-occurring
	in subsequent passes.
      </p><p>
	As an example, consider the following multiple alignment:
      </p><div class="figure"><a name="chap_intro::srmc_in_454sxahyb_1stpass.png"></a><p class="title"><b>Figure 1.1. How MIRA learns from misassemblies (1). Multiple alignment
	after 1st pass with an obvious assembly error, notice the clustered
	column discrepancies. Two slightly different repeats were assembled
	together.</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/srmc_in_454sxahyb_1stpass.png" width="100%" alt="How MIRA learns from misassemblies (1). Multiple alignment after 1st pass with an obvious assembly error, notice the clustered column discrepancies. Two slightly different repeats were assembled together."></td></tr></table></div></div></div><br class="figure-break"><p>
	These kinds of errors can be easily spotted by a human, but are hard to
	prevent by normal alignment algorithms as sometimes there's only one
	single base column difference between repeats (and not several as in
	this example).
      </p><p>
	MIRA spots these things (even if it's only a single column), tags the
	base positions in the reads with additional information and then will
	use that information in subsequent passes. The net effect is shown in
	the next two figures:
      </p><div class="figure"><a name="chap_intro::srmc_in_454sxahyb_lastpass1.png"></a><p class="title"><b>Figure 1.2. 
	  Multiple alignment after last pass where assembly errors from
	  previous passes have been resolved (1st repeat site)
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/srmc_in_454sxahyb_lastpass1.png" width="100%" alt="Multiple alignment after last pass where assembly errors from previous passes have been resolved (1st repeat site)"></td></tr></table></div></div></div><br class="figure-break"><div class="figure"><a name="chap_intro::srmc_in_454sxahyb_lastpass2.png"></a><p class="title"><b>Figure 1.3. 
	  Multiple alignment after last pass where assembly errors from
	  previous passes have been resolved (2nd repeat site)
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/srmc_in_454sxahyb_lastpass2.png" width="100%" alt="Multiple alignment after last pass where assembly errors from previous passes have been resolved (2nd repeat site)"></td></tr></table></div></div></div><br class="figure-break"><p>
	The ability of MIRA to learn and discern non-identical repeats from
	each other through column discrepancies is nothing new. Here's the
	link to a paper from a talk I gave at the German Conference on
	Bioinformatics in 1999: <a class="ulink" href="http://www.bioinfo.de/isb/gcb99/talks/chevreux/" target="_top">http://www.bioinfo.de/isb/gcb99/talks/chevreux/</a>
      </p><p>
	I'm sure you'll recognise the basic principle in figures 8 and 9. The
	slides from the corresponding talk also look very similar to the
	screen shots above:
      </p><div class="figure"><a name="chap_intro::gcb99_replocator.png"></a><p class="title"><b>Figure 1.4. 
	  Slides presenting the repeat locator at the GCB 99
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/gcb99_replocator.png" width="100%" alt="Slides presenting the repeat locator at the GCB 99"></td></tr></table></div></div></div><br class="figure-break"><p>
	You can get the talk with these slides here: <a class="ulink" href="http://chevreux.org/dkfzold/gcb99/bachvortrag_gcb99.ppt" target="_top">http://chevreux.org/dkfzold/gcb99/bachvortrag_gcb99.ppt</a>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_automatic_editors"></a>1.5.2. 
	MIRA has integrated editors for data from Sanger, 454, IonTorrent sequencing
      </h3></div></div></div><p>
	Since the first versions in 1999, the <span class="emphasis"><em>EdIt</em></span>
	automatic Sanger sequence editor from Thomas Pfisterer has been
	integrated into MIRA.
      </p><div class="figure"><a name="chap_intro::gcb99_edit.png"></a><p class="title"><b>Figure 1.5. 
	  Slides presenting the EdIt automatic Sanger editor at the GCB 99
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/gcb99_edit.png" width="100%" alt="Slides presenting the EdIt automatic Sanger editor at the GCB 99"></td></tr></table></div></div></div><br class="figure-break"><p>
	The routines use a combination of hypothesis generation/testing
	together with neural networks (trained on ABI and ALF traces) for
	signal recognition to discern between base calling errors and true
	multiple alignment differences. They go back to the trace data to
	resolve potential conflicts and eventually recall bases using the
	additional information gained in a multiple alignment of reads.
      </p><div class="figure"><a name="chap_intro::san_autoedit1.png"></a><p class="title"><b>Figure 1.6. 
	  Sanger assembly without EdIt automatic editing routines. The bases
	  with blue background are base calling errors.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/san_autoedit1.png" width="100%" alt="Sanger assembly without EdIt automatic editing routines. The bases with blue background are base calling errors."></td></tr></table></div></div></div><br class="figure-break"><div class="figure"><a name="chap_intro::san_autoedit2.png"></a><p class="title"><b>Figure 1.7. 
	  Sanger assembly with EdIt automatic editing routines. Bases with
	  pink background are corrections made by EdIt after assessing the
	  underlying trace files (SCF files in this case). Bases with blue
	  background are base calling errors where the evidence in the trace
	  files did not show enough evidence to allow an editing correction.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/san_autoedit2.png" width="100%" alt="Sanger assembly with EdIt automatic editing routines. Bases with pink background are corrections made by EdIt after assessing the underlying trace files (SCF files in this case). Bases with blue background are base calling errors where the evidence in the trace files did not show enough evidence to allow an editing correction."></td></tr></table></div></div></div><br class="figure-break"><p>
	With the introduction of 454 reads, MIRA also got in 2007 specialised
	editors to search and correct for typical 454 sequencing problems like
	the homopolymer run over-/undercalls. These editors are now integrated
	into MIRA itself and are not part of EdIt anymore.
      </p><p>
	While not being paramount to the assembly quality, both editors
	provide additional layers of safety for the MIRA learning algorithm to
	discern non-perfect repeats even on a single base
	discrepancy. Furthermore, the multiple alignments generated by these
	two editors are way more pleasant to look at (or automatically
	analyse) than the ones containing all kinds of gaps, insertions,
	deletions etc.
      </p><div class="figure"><a name="chap_intro::454_autoedit1.png"></a><p class="title"><b>Figure 1.8. 
	  454 assembly without 454 automatic editing routines
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/454_autoedit1.png" width="100%" alt="454 assembly without 454 automatic editing routines"></td></tr></table></div></div></div><br class="figure-break"><div class="figure"><a name="chap_intro::454_autoedit2.png"></a><p class="title"><b>Figure 1.9. 
	  454 assembly with 454 automatic editing routines
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/454_autoedit2.png" width="100%" alt="454 assembly with 454 automatic editing routines"></td></tr></table></div></div></div><br class="figure-break"></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_whycontigsend"></a>1.5.3. 
	MIRA lets you see why contigs end where they end
      </h3></div></div></div><p>
	Very useful for finishing are the kmer (hash) frequency tags
	which MIRA sets in the assembly. Provided your finishing editor
	understands those tags
	(<span class="command"><strong>gap4</strong></span>, <span class="command"><strong>gap5</strong></span>
	and <span class="command"><strong>consed</strong></span> are fine but there may be others),
	they'll give you precious insight where you might want to be cautious
	when joining two contigs or where you would need to perform some primer
	walking. MIRA colourises the assembly with the hash frequency (HAF)
	tags to show repetitiveness.
      </p><p>
	You will need to read about the HAF tags in the reference manual, but
	in a nutshell: the HAF5, HAF6 and HAF7 tags tell you that you potentially have
	repetitive to very repetitive read areas in the genome, while HAF2
	tags will tell you that these areas in the genome have not been
	covered as well as they should have been.
      </p><p>
	As an example, the following figure shows the coverage of a contig.
      </p><div class="figure"><a name="chap_intro::haf5_haf2_contigcoverage_ovals.png"></a><p class="title"><b>Figure 1.10. 
	  Coverage of a contig.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/haf5_haf2_contigcoverage_ovals.png" width="100%" alt="Coverage of a contig."></td></tr></table></div></div></div><br class="figure-break"><p>
	The question is now: why did MIRA stop building this contig on the
	left end (left oval) and why on the right end (right oval)?
      </p><p>
	Looking at the HAF tags in the contig, the answer quickly becomes
	clear: the left contig end has HAF5 tags in the reads (shown in bright
	red in the following figure). This tells you that MIRA stopped because
	it probably could not unambiguously continue building this
	contig. Indeed, if you BLAST the sequence at the NCBI, you will find
	out that this is an rRNA area of a bacterium, of which bacteria
	normally have several copies in the genome:
      </p><div class="figure"><a name="chap_intro::haf5_repend_rrna.png"></a><p class="title"><b>Figure 1.11. 
	  HAF5 tags (reads shown with red background) covering a contig end
	  show repetitiveness as reason for stopping a contig build.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/haf5_repend_rrna.png" width="100%" alt="HAF5 tags (reads shown with red background) covering a contig end show repetitiveness as reason for stopping a contig build."></td></tr></table></div></div></div><br class="figure-break"><p>
	The right end of the same contig however ends in HAF3 tags (normal
	coverage, bright green in the next figure) and even HAF2 tags (below
	average coverage, pale green in the next image). This tells you MIRA
	stopped building the contig at this place simply because there were
	no more reads to continue. This is a perfect target for primer
	walking if you want to finish a genome.
      </p><div class="figure"><a name="chap_intro::haf2_end_nomoredata.png"></a><p class="title"><b>Figure 1.12. 
	  HAF2 tags covering a contig end show that no more reads were
	  available for assembly at this position.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/haf2_end_nomoredata.png" width="100%" alt="HAF2 tags covering a contig end show that no more reads were available for assembly at this position."></td></tr></table></div></div></div><br class="figure-break"></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_stmshybrid_tags"></a>1.5.4. 
	MIRA tags problematic decisions in hybrid assemblies
      </h3></div></div></div><p>
	Many people combine Sanger &amp; 454 -- or nowadays more 454 &amp;
	Solexa -- to improve the sequencing quality of their project through
	two (or more) sequencing technologies. To reduce time spent in
	finishing, MIRA automatically tags those bases in a consensus of a
	hybrid assembly where reads from different sequencing technologies
	severely contradict each other.
      </p><p>
	The following example shows a hybrid 454 / Solexa assembly where reads
	from 454 (highlighted read names in following figure) were not sure
	whether to have one or two "G" at a certain position. The consensus
	algorithm would have chosen "two Gs" for 454, obviously a wrong
	decision as all Solexa reads at the same spot (the reads which are not
	highlighted) show only one "G" for the given position. While MIRA
	chose to believe Solexa in this case, it tagged the position anyway in
	case someone chooses to check these kinds of things.
      </p><div class="figure"><a name="chap_intro::454sxa_stms_hybdenovo.png"></a><p class="title"><b>Figure 1.13. 
	  A "STMS" tag (Sequencing Technology Mismatch Solved, the black
	  square base in the consensus) showing a potentially difficult
	  decision in a hybrid 454 / Solexa de-novo assembly.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/454sxa_stms_hybdenovo.png" width="100%" alt='A "STMS" tag (Sequencing Technology Mismatch Solved, the black square base in the consensus) showing a potentially difficult decision in a hybrid 454 / Solexa de-novo assembly.'></td></tr></table></div></div></div><br class="figure-break"><p>
	This works also for other sequencing technology combinations or in
	mapping assemblies. The following is an example in a hybrid Sanger /
	454 project where by pure misfortune, all Sanger reads have a base
	calling error at a given position while the 454 reads show the true
	sequence.
      </p><div class="figure"><a name="chap_intro::454san_stmu_hybdenovo.png"></a><p class="title"><b>Figure 1.14. 
	  A "STMU" tag (Sequencing Technology Mismatch Unresolved, light blue
	  square in the consensus at lower end of large oval) showing a
	  potentially difficult decision in a hybrid Sanger / 454 mapping
	  assembly.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/454san_stmu_hybdenovo.png" width="100%" alt='A "STMU" tag (Sequencing Technology Mismatch Unresolved, light blue square in the consensus at lower end of large oval) showing a potentially difficult decision in a hybrid Sanger / 454 mapping assembly.'></td></tr></table></div></div></div><br class="figure-break"></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_cer_reads"></a>1.5.5. 
	MIRA allows older finishing programs to cope with the amount of data in Solexa
	mapping projects
      </h3></div></div></div><p>
	Quality control is paramount when you do mutation analysis for
	biologists: I know they'll be on my doorstep the very next minute they
	find out one of the SNPs in the resequencing data wasn't a SNP, but a
	sequencing artefact. And I can understand them: why should they invest
	-- per SNP -- hours in the wet lab if I can invest a couple of minutes
	to get them data with false negative rates (and false discovery rates) way
	below 1%? So, finishing and quality control for any mapping project is
	a must.
      </p><p>
	Both <span class="command"><strong>gap4</strong></span> and <span class="command"><strong>consed</strong></span> start to
	have a couple of problems when projects have millions of reads: you
	need lots of RAM and scrolling around the assembly becomes a test of your
	patience. Still, these two assembly finishing programs are amongst the
	better ones out there, although <span class="command"><strong>gap5</strong></span> is
	quickly arriving at a state in which it can substitute for
	<span class="command"><strong>gap4</strong></span>.
      </p><p>
	So, MIRA reduces the number of reads in Solexa mapping projects
	without sacrificing information on coverage. The principle is pretty
	simple: for 100% matching reads, MIRA tracks coverage of every
	reference base and creates long synthetic, coverage equivalent reads
	(CERs) in exchange for the Solexa reads. Reads that do not match 100%
	are kept as separate entities, so that no information gets lost. The
	following figure illustrates this:
      </p><div class="figure"><a name="chap_intro::sxa_cer_reads1.png"></a><p class="title"><b>Figure 1.15. 
	    Coverage equivalent reads (CERs) explained.
	  <p>
	    Left side of the figure: a conventional mapping with eleven reads
	    of size 4 against a consensus (in uppercase). The inversed base in
	    the lowest read depicts a sequencing error.
	  </p>
	  <p>
	    Right side of the figure: the same situation, but with coverage
	    equivalent reads (CERs). Note that there are fewer reads, but no
	    information is lost: the coverage of each reference base is
	    equivalent to the left side of the figure and reads with
	    differences to the reference are still present.
	  </p>
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_cer_reads1.png" width="100%" alt="Coverage equivalent reads (CERs) explained. Left side of the figure: a conventional mapping with eleven reads of size 4 against a consensus (in uppercase). The inversed base in the lowest read depicts a sequencing error. Right side of the figure: the same situation, but with coverage equivalent reads (CERs). Note that there are less reads, but no information is lost: the coverage of each reference base is equivalent to the left side of the figure and reads with differences to the reference are still present."></td></tr></table></div></div></div><br class="figure-break"><p>
	This strategy is very effective in reducing the size of a project. As
	an example, in a mapping project with 9 million Solexa 36mers, MIRA
	created a project with 1.7m reads: 700k CER reads representing ~8
	million 100% matching Solexa reads, and it kept ~950k mapped reads as
	they had &#8805; 1 mismatch (be it sequencing error or true SNP) to the
	reference. A reduction of 80%, and numbers for mapping projects with
	Solexa 100bp reads are in a similar range.
      </p><p>
	Also, mutations of the resequenced strain now really stand out in the
	assembly viewer as the following figure shows:
      </p><div class="figure"><a name="chap_intro::sxa_cer_reads2.png"></a><p class="title"><b>Figure 1.16. 
	  Coverage equivalent reads let SNPs become very visible in assembly viewers
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_cer_reads2.png" width="100%" alt="Coverage equivalent reads let SNPs become very visible in assembly viewers"></td></tr></table></div></div></div><br class="figure-break"></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_mapping_tags"></a>1.5.6. 
	MIRA tags SNPs and other features, outputs result files
	for biologists
      </h3></div></div></div><p>
	Want to assemble two or more very closely related genomes without a
	reference, but still find SNPs or differences between them?
      </p><p>
	Tired of looking at some text output from mapping programs and
	guessing whether a SNP is really a SNP or just some random junk?
      </p><p>
	MIRA tags all SNPs (and other features like missing coverage etc.) it
	finds so that -- when using a finishing viewer like gap4 or consed --
	one can quickly jump from tag to tag and perform quality control. This
	works both in de-novo assembly and in mapping assembly, all MIRA needs
	is the information which read comes from which strain.
      </p><p>
	The following figure shows a mapping assembly of Solexa 36mers against
	a bacterial reference sequence, where a mutant has an indel position
	in a gene:
      </p><div class="figure"><a name="chap_intro::sxa_sroc_lenski2.png"></a><p class="title"><b>Figure 1.17. 
	  "SROc" tag (Snp inteR Organism on Consensus) showing a SNP position
	  in a Solexa mapping assembly.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_sroc_lenski2.png" width="100%" alt='"SROc" tag (Snp inteR Organism on Consensus) showing a SNP position in a Solexa mapping assembly.'></td></tr></table></div></div></div><br class="figure-break"><p>
	Other interesting places like deletions of whole genome parts are also
	directly tagged by MIRA and noted in diverse result files (and
	searchable in assembly viewers):
      </p><div class="figure"><a name="chap_intro::sxa_mcvc_lenski.png"></a><p class="title"><b>Figure 1.18. 
	  "MCVc" tag (Missing CoVerage in Consensus, dark red stretch in figure)
	  showing a genome deletion in Solexa mapping assembly.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_mcvc_lenski.png" width="100%" alt='"MCVc" tag (Missing CoVerage in Consensus, dark red stretch in figure) showing a genome deletion in Solexa mapping assembly.'></td></tr></table></div></div></div><br class="figure-break"><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	For bacteria -- and if you use annotated GenBank files as reference
	sequence -- MIRA will also output some nice lists directly usable (in
	Excel) by biologists, telling them which gene was affected by what
	kind of SNP, whether it changes the protein, the original and the
	mutated protein sequence etc.
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_miramuchmore"></a>1.5.7. 
	MIRA has ... much more
      </h3></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    Extensive possibilities to clip data if needed: by quality, by
	    masked bases, by A/T stretches, by evidence from other reads, ...
	  </p></li><li class="listitem"><p>
	    Routines to re-extend reads into clipped parts if multiple
	    alignment allows for it.
	  </p></li><li class="listitem"><p>
	    Read in ancillary data in different formats: EXP, NCBI TRACEINFO
	    XML, SSAHA2, SMALT result files and text files.
	  </p></li><li class="listitem"><p>
	    Detection of chimeric reads.
	  </p></li><li class="listitem"><p>
	    Pipeline to discover SNPs in ESTs from different strains
	    (miraSearchESTSNPs)
	  </p></li><li class="listitem"><p>
	    Support for many different input and output formats (FASTA,
	    EXP, FASTQ, CAF, MAF, ...)
	  </p></li><li class="listitem"><p>
	    Automatic memory management (when RAM is tight)
	  </p></li><li class="listitem"><p>
	    Over 150 parameters to tune the assembly for a lot of use cases,
	    many of these parameters being tunable individually depending on
	    the sequencing technology they apply to.
	  </p></li></ul></div><p>
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_intro_versions_licenses_disclaimer_and_copyright"></a>1.6. 
      Versions, Licenses, Disclaimer and Copyright
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_versions"></a>1.6.1. 
	Versions
      </h3></div></div></div><p>
	There are two kinds of versions of MIRA that can be compiled from
	source files: production and development.
      </p><p>
	Production versions are from the stable branch of the source code. These
	versions are available for download from SourceForge.
      </p><p>
	Development versions are from the development branch of the source
	tree. These are also made available to the public and should be
	compiled by users who want to test out new functionality or to track
	down bugs or errors that might arise at a given location. Release
	candidates (rc) also fall into the development versions: they are
	usually the last versions of a given development branch before being
	folded back into the production branch.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_licenses"></a>1.6.2. 
	License
      </h3></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_intro_licensemira"></a>1.6.2.1. 
	  MIRA
	</h4></div></div></div><p>
	  MIRA has been put under the GPL version 2.
	</p><p>
	  This program is free software; you can redistribute it and/or modify
	  it under the terms of the GNU General Public License as published by
	  the Free Software Foundation; either version 2 of the License, or (at
	  your option) any later version.
	</p><p>
	  This program is distributed in the hope that it will be useful, but
	  WITHOUT ANY WARRANTY; without even the implied warranty of
	  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	  General Public License for more details.
	</p><p>
	  You should have received a copy of the GNU General Public License
	  along with this program; if not, write to the Free Software
	  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
	  02110-1301, USA
	</p><p>
	  You may also visit <a class="ulink" href="http://www.opensource.org/licenses/gpl-2.0.php" target="_top">http://www.opensource.org/licenses/gpl-2.0.php</a> at the Open
	  Source Initiative for a copy of this licence.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_intro_licensedocs"></a>1.6.2.2. 
	  Documentation
	</h4></div></div></div><p>
	  The documentation pertaining to MIRA is licensed under the Creative
	  Commons Attribution-NonCommercial-ShareAlike 3.0 Unported
	  License. To view a copy of this license, visit <a class="ulink" href="http://creativecommons.org/licenses/by-nc-sa/3.0/" target="_top">http://creativecommons.org/licenses/by-nc-sa/3.0/</a> or send a
	  letter to Creative Commons, 171 Second Street, Suite 300, San
	  Francisco, California, 94105, USA.
	</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_copyright"></a>1.6.3. 
	Copyright
      </h3></div></div></div><p>
	© 1997-2000 Deutsches Krebsforschungszentrum Heidelberg -- Dept.
	of Molecular Biophysics and Bastien Chevreux (for MIRA) and Thomas
	Pfisterer (for EdIt)
      </p><p>
	© 2001-2014 Bastien Chevreux.
      </p><p>
	All rights reserved.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_external_libraries"></a>1.6.4. 
	External libraries
      </h3></div></div></div><p>
	MIRA uses the excellent Expat library to parse XML files. Expat is Copyright
	©  1998, 1999, 2000 Thai Open Source Software Center Ltd and Clark
	Cooper as well as Copyright ©
	2001, 2002 Expat maintainers.
      </p><p>
	See <a class="ulink" href="http://www.libexpat.org/" target="_top">http://www.libexpat.org/</a> and
	<a class="ulink" href="http://sourceforge.net/projects/expat/" target="_top">http://sourceforge.net/projects/expat/</a> for more information on Expat.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_intro_getting_help___mailing_lists___reporting_bugs"></a>1.7. 
      Getting help / Mailing lists / Reporting bugs
    </h2></div></div></div><p>
      Please try to find an answer to your question by first reading the
      documents provided with the MIRA package (FAQs, READMEs, usage guide,
      guides for specific sequencing technologies etc.). It's a lot, but then
      again, they hopefully should cover 90% of all questions.
    </p><p>
      If you have a tough nut to crack or simply could not find what you were
      searching for, you can subscribe to the MIRA talk mailing list and send
      in your question (or comment, or suggestion), see <a class="ulink" href="http://www.chevreux.org/mira_mailinglists.html" target="_top">http://www.chevreux.org/mira_mailinglists.html</a> for more
      information on that. Now that the number of subscribers has reached a
      good level, there's a fair chance that someone could answer your
      question before I have the opportunity or while I'm away from mail for a
      certain time.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	Please very seriously consider using the mailing list before mailing
	me directly. Every question which can be answered by participants of
	the list is time I can invest in development and documentation of
	MIRA. I have a day job as a bioinformatician which has nothing to do
	with MIRA, and after-work hours are rare enough nowadays.
      </p><p>
	Furthermore, Google indexes the mailing list and every discussion /
	question asked on the mailing list helps future users as they show up
	in Google searches.
      </p><p>
	Only mail me directly (bach@chevreux.org) if you feel that there's
	some information you absolutely do not want to share publicly.
      </p></td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      Subscribing to the list <span class="emphasis"><em>before sending mails to it </em></span>
      is necessary as messages from non-subscribers will be stopped by the
      system to keep the spam level low.
    </td></tr></table></div><p>
      To report bugs or ask for new features, please use the SourceForge
      ticketing system at: <a class="ulink" href="http://sourceforge.net/p/mira-assembler/tickets/" target="_top">http://sourceforge.net/p/mira-assembler/tickets/</a>. This ensures
      that requests do not get lost <span class="bold"><strong>and</strong></span> you
      get the additional benefit to automatically know when a bug has been
      fixed as I will not send separate emails, that's what bug trackers are
      there for.
    </p><p>
      Finally, new or intermediate versions of MIRA will be announced on the
      separate MIRA announce mailing list. Traffic is very low there as the
      only one who can post there is me. Subscribe if you want to be informed
      automatically on new releases of MIRA.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_intro_author"></a>1.8. 
      Author
    </h2></div></div></div><p>
      Bastien Chevreux (mira): <code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code>
    </p><p>
      WWW:  <a class="ulink" href="http://www.chevreux.org/" target="_top">http://www.chevreux.org/</a>
    </p><p>
      MIRA can use automatic editing routines for Sanger sequences which were
      written by Thomas Pfisterer (EdIt):
      <code class="email">&lt;<a class="email" href="mailto:t.pfisterer@dkfz-heidelberg.de">t.pfisterer@dkfz-heidelberg.de</a>&gt;</code>
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_intro_miscellaneous"></a>1.9. 
      Miscellaneous
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_citations"></a>1.9.1. 
	Citing MIRA
      </h3></div></div></div><p>
	Please use these citations:
      </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	    For <span class="command"><strong>mira</strong></span>
	  </span></dt><dd><p>
	      Chevreux, B., Wetter, T. and Suhai, S. (1999): <span class="emphasis"><em>Genome
	      Sequence Assembly Using Trace Signals and Additional Sequence
	      Information</em></span>. Computer Science and Biology:
	      Proceedings of the German Conference on Bioinformatics (GCB) 99,
	      pp. 45-56.
	    </p></dd><dt><span class="term">
	    For <span class="command"><strong>miraSearchESTSNPs</strong></span> (was named
	    <span class="command"><strong>miraEST</strong></span> in earlier times)
	  </span></dt><dd><p> Chevreux, B., Pfisterer, T., Drescher, B., Driesel, A. J.,
	    Müller, W. E., Wetter, T. and Suhai, S. (2004): <span class="emphasis"><em>Using
	    the miraEST Assembler for Reliable and Automated mRNA Transcript
	    Assembly and SNP Detection in Sequenced ESTs</em></span>. Genome
	    Research, 14(6)
	    </p></dd></dl></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_intro_postcards_gold_and_jewellery"></a>1.9.2. 
	Postcards, gold and jewellery
      </h3></div></div></div><p>
	If you find this software useful, please send the author a postcard. If
	postcards are not available, a treasure chest full of Spanish doubloons, gold
	and jewellery will do nicely, thank you.
      </p></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_installation"></a>Chapter 2. Installing MIRA</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_install_wheretofetch">2.1. 
      Where to fetch MIRA
    </a></span></dt><dt><span class="sect1"><a href="#sect_install_precompiledbinary">2.2. 
      Installing from a precompiled binary package
    </a></span></dt><dt><span class="sect1"><a href="#sect_install_third_party_integration">2.3. 
      Integration with third party programs (gap4, consed)
    </a></span></dt><dt><span class="sect1"><a href="#sect_install_compiling">2.4. 
      Compiling MIRA yourself
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_install_comp_prereq">2.4.1. 
	Prerequisites
      </a></span></dt><dt><span class="sect2"><a href="#sect_install_comp_comp">2.4.2. 
	Compiling and installing
      </a></span></dt><dt><span class="sect2"><a href="#sect_install_comp_conf">2.4.3. 
	Configure switches for MIRA
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_install_comp_conf_boost">2.4.3.1. 
	  BOOST configure switches for MIRA
	</a></span></dt><dt><span class="sect3"><a href="#sect_install_comp_conf_mira">2.4.3.2. 
	  MIRA specific configure switches
	</a></span></dt></dl></dd></dl></dd><dt><span class="sect1"><a href="#sect_install_walkthroughs">2.5. 
      Installation walkthroughs
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_install_walkthroughs_kubuntu">2.5.1. 
	(K)Ubuntu 12.04
      </a></span></dt><dt><span class="sect2"><a href="#sect_install_walkthroughs_opensuse">2.5.2. 
	openSUSE 12.1
      </a></span></dt><dt><span class="sect2"><a href="#sect_install_walkthroughs_fedora">2.5.3. 
	Fedora 17
      </a></span></dt><dt><span class="sect2"><a href="#sect_install_walkthroughs_allfromscratch">2.5.4. 
	Compile everything from scratch
      </a></span></dt><dt><span class="sect2"><a href="#sect_install_walkthroughs_dynamic">2.5.5. 
	Dynamically linked MIRA
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_install_hintotherplatforms">2.6. 
      Compilation hints for other platforms.
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_install_hintosx">2.6.1. 
	Mac OS X
      </a></span></dt><dt><span class="sect2"><a href="#sect_install_hintnetbsd5">2.6.2. 
	NetBSD 5 (i386)
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">A problem can be found to almost every solution.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_install_wheretofetch"></a>2.1. 
      Where to fetch MIRA
    </h2></div></div></div><p>
      SourceForge: <a class="ulink" href="http://sourceforge.net/projects/mira-assembler/" target="_top">http://sourceforge.net/projects/mira-assembler/</a>
    </p><p>
      There you will normally find a couple of precompiled binaries -- usually
      for Linux and Mac OSX -- or the source package for compiling yourself.
    </p><p>
      Precompiled binary packages are named in the following way:
    </p><p>
      <code class="filename">mira_<em class="replaceable"><code>miraversion</code></em>_<em class="replaceable"><code>OS-and-binarytype</code></em>.tar.bz2</code>
    </p><p>
      where
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	  For <code class="filename"><em class="replaceable"><code>miraversion</code></em></code>, the
	  stable versions of MIRA with the general public as audience usually
	  have a version number in three parts, like
	  <code class="filename">3.0.5</code>, sometimes also followed by some postfix
	  like in <code class="filename">3.2.0rc1</code> to denote release candidate 1
	  of the 3.2.0 version of MIRA. On very rare occasions, stable
	  versions of MIRA can have four parts, as in, e.g.,
	  <code class="filename">3.4.0.1</code>: these versions create identical
	  binaries to their parent version (<code class="filename">3.4.0</code>) and
	  just contain fixes to the source build machinery.
	</p><p>
	  The version string sometimes can have a different format:
	  <code class="filename"><span class="emphasis"><em>sometext</em></span>-0-g<span class="emphasis"><em>somehexnumber</em></span></code>
	  like in, e.g.,
	  <code class="filename">ft_fastercontig-0-g4a27c91</code>. These versions of
	  MIRA are snapshots from the development tree of MIRA and usually
	  contain new functionality which may not be as well tested as the
	  rest of MIRA, hence contain more checks and more debugging output
	  to catch potential errors.
	</p></li><li class="listitem"><p>
	  <code class="filename"><em class="replaceable"><code>OS-and-binarytype</code></em></code>
	  finally defines for which operating system and which processor class
	  the package is destined. E.g.,
	  <code class="filename">linux-gnu_x86_64_static</code> contains static
	  binaries for Linux running a 64 bit processor.
	</p></li></ul></div><p>
      Source packages are usually named
    </p><p>
      <code class="filename">mira-<em class="replaceable"><code>miraversion</code></em>.tar.bz2</code>
    </p><p>
      Examples for packages at SourceForge:
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><code class="filename">mira_3.0.5_prod_linux-gnu_x86_64_static.tar.bz2</code></li><li class="listitem"><code class="filename">mira_3.0.5_prod_linux-gnu_i686_32_static.tar.bz2</code></li><li class="listitem"><code class="filename">mira_3.0.5_prod_OSX_snowleopard_x86_64_static.tar.bz2</code></li><li class="listitem"><code class="filename">mira-3.0.5.tar.bz2</code></li></ul></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_install_precompiledbinary"></a>2.2. 
      Installing from a precompiled binary package
    </h2></div></div></div><p>
      Download the package, unpack it. Inside, there is -- besides other
      directories -- a <code class="filename">bin</code> directory. Copy or move the files and
      soft-links inside this directory to a directory in your $PATH variable.
    </p><p>
      Additional scripts for special purposes are in the
      <code class="filename">scripts</code> directory. You might or might not want to
      have them in your $PATH.
    </p><p>
      Scripts and programs for MIRA from other authors are in the
      <code class="filename">3rdparty</code> directory. Here too, you may or may not
      want to have (some of them) in your $PATH.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_install_third_party_integration"></a>2.3. 
      Integration with third party programs (gap4, consed)
    </h2></div></div></div><p>
      MIRA sets tags in the assemblies that can be read and interpreted by the
      Staden <span class="command"><strong>gap4</strong></span> package or
      <span class="command"><strong>consed</strong></span>. These tags are extremely useful to
      efficiently find places of interest in an assembly (be it de-novo or
      mapping), but both <span class="command"><strong>gap4</strong></span> and <span class="command"><strong>consed</strong></span>
      need to be told about these tags.
    </p><p>
      Data files for a correct integration are delivered in the
      <code class="filename">support</code> directory of the distribution. Please
      consult the README in that directory for more information on how to
      integrate this information in either of these packages.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_install_compiling"></a>2.4. 
      Compiling MIRA yourself
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_comp_prereq"></a>2.4.1. 
	Prerequisites
      </h3></div></div></div><p>
	The MIRA 3.x series works with quite old systems, the upcoming 4.x
	series will need a C++11 compatible tool chain, i.e., systems starting
	from the latter half of 2011 should be OK. The requisites for
	<span class="emphasis"><em>compiling</em></span> MIRA are:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    gcc &#8805; 4.8.1, with libstdc++6. You really want to use a simple
	    installation package pre-configured for your system, but in case you
	    want or have to install gcc yourself, please refer to <a class="ulink" href="http://gcc.gnu.org/" target="_top">http://gcc.gnu.org/</a> for more information on the GNU compiler
	    collection.
	  </p></li><li class="listitem"><p>
	    BOOST library &#8805; 1.48. Lower versions might work, but are
	    untested. You would need to change the checking in the configure
	    script for this to run through. You really want to use a simple
	    installation package pre-configured for your system, but in case you
	    want or have to install BOOST yourself, please refer to <a class="ulink" href="http://www.boost.org/" target="_top">http://www.boost.org/</a> for more information on the BOOST
	    library.
	  </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	    Do NOT use a so called <span class="emphasis"><em>staged</em></span> BOOST library,
	    only a fully installed library will work at the moment
	  </td></tr></table></div></li><li class="listitem">
	  zlib. Should your system not have zlib installed or available as
	  simple installation package, please see <a class="ulink" href="http://www.zlib.net/" target="_top">http://www.zlib.net/</a> for more information regarding zlib.
	</li><li class="listitem">
	  GNU make. Should your system not have gmake installed or available
	  as simple installation package, please see <a class="ulink" href="https://www.gnu.org/software/make/" target="_top">https://www.gnu.org/software/make/</a> for more information regarding
	  GNU make.
	</li><li class="listitem">
	  GNU flex &#8805; 2.5.33. Should your system not have flex installed or
	  available as simple installation package, please see <a class="ulink" href="http://flex.sourceforge.net/" target="_top">http://flex.sourceforge.net/</a> for more information regarding
	  flex.
	</li><li class="listitem">
	  Expat library &#8805; 2.0.1. Should your system not have the Expat library and
	  header files already installed or available as simple installation
	  package, you will need to download and install it yourself. Please see
	  <a class="ulink" href="http://www.libexpat.org/" target="_top">http://www.libexpat.org/</a> and <a class="ulink" href="http://sourceforge.net/projects/expat/" target="_top">http://sourceforge.net/projects/expat/</a> for information on how
	  to do this.
	</li><li class="listitem">
	  xxd. A small utility from the <span class="command"><strong>vim</strong></span> package.
	</li></ul></div><p>
	For <span class="emphasis"><em>building the documentation</em></span>, additional
	prerequisites are from the DocBook tool chain:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem">
	  xsltproc + docbook-xsl for HTML output
	</li><li class="listitem">
	  dblatex for PDF output
	</li></ul></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
	  Previous versions of MIRA had a benefit by using the TCMalloc
	  library. This is not the case anymore! Indeed, tests showed that when
	  using TCMalloc, MIRA 4.9.x and above will probably need 20 to
	  30% <span class="emphasis"><em>more</em></span> max memory and up to 80% more overall
	  memory than without TCMalloc.
	</p><p>
	  In short: do not use TCMalloc at the moment.
	</p></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_comp_comp"></a>2.4.2. 
	Compiling and installing
      </h3></div></div></div><p>
	MIRA uses the GNU autoconf/automake tools, please read the section
	"Basic Installation" of the <code class="filename">INSTALL</code> file in the
	source package of MIRA for more generic information on how to invoke
	them.
      </p><p>
	The short version: simply type
      </p><pre class="screen">
<code class="prompt">arcadia:/path/to/mira-4.0.0$</code> <strong class="userinput"><code>./configure</code></strong>
<code class="prompt">arcadia:/path/to/mira-4.0.0$</code> <strong class="userinput"><code>make</code></strong>
<code class="prompt">arcadia:/path/to/mira-4.0.0$</code> <strong class="userinput"><code>make install</code></strong></pre><p>
	This should install the following programs:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><span class="command"><strong>mira</strong></span></li><li class="listitem"><span class="command"><strong>miraconvert</strong></span></li><li class="listitem"><span class="command"><strong>mirabait</strong></span></li><li class="listitem"><span class="command"><strong>miramem</strong></span></li></ul></div><p>
	Should the <code class="literal">./configure</code> step fail for some reason or
	another, you should get a message telling you at which step this
	happens and can then either install missing packages or tell
	<span class="command"><strong>configure</strong></span> where it should search the packages it
	did not find, see also next section.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_comp_conf"></a>2.4.3. 
	Configure switches for MIRA
      </h3></div></div></div><p>
	MIRA understands all standard autoconf configure switches like <code class="literal">--prefix=</code>
	etc. Please consult the INSTALL file in the MIRA top level directory
	of the source package and also call <code class="literal">./configure
	--help</code> to get a full list of currently supported switches.
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_install_comp_conf_boost"></a>2.4.3.1. 
	  BOOST configure switches for MIRA
	</h4></div></div></div><p>
	  BOOST is maybe the most tricky library to get right in case it does
	  not come pre-configured for your system. The two main switches for
	  helping to locate BOOST are
	  probably <code class="literal">--with-boost=[ARG]</code>
	  and <code class="literal">--with-boost-libdir=LIB_DIR</code>. Only if those
	  two fail, try using the other <code class="literal">--with-boost-*=</code> switches
	  you will see from the ./configure help text.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_install_comp_conf_mira"></a>2.4.3.2. 
	  MIRA specific configure switches
	</h4></div></div></div><p>
	  MIRA honours the following switches:
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      --enable-64=yes/no
	    </span></dt><dd><p>
		MIRA should happily build as 32 bit executable on 32 bit
		platforms and as 64 bit executable on 64 bit platforms. On 64
		bit platforms, setting the switch to 'no' forces the compiler
		to produce 32 bit executables (if possible).
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		As of MIRA 3.9.0, support for 32 bit platforms is being
		slowly phased out. While MIRA should compile and also run fine
		on 32 bit platforms, I do not guarantee it anymore as I
		haven't used 32 bit systems in the last 5 years.
	      </td></tr></table></div></dd><dt><span class="term">
	      --enable-warnings
	    </span></dt><dd>
	      Enables compiler warnings, useful only for developers, not for users.
	    </dd><dt><span class="term">
	      --enable-debug
	    </span></dt><dd>
	      Lets the MIRA binary contain C/C++ debug symbols.
	    </dd><dt><span class="term">
	      --enable-mirastatic
	    </span></dt><dd>
	      Builds static binaries which are easier to distribute. Some
	      platforms (like OpenSolaris) might not like this and you will
	      get an error from the linker.
	    </dd><dt><span class="term">
	      --enable-optimisations
	    </span></dt><dd>
	      Instructs the configure script to set optimisation switches for compiling
	      (on by default). Switching optimisations off (warning, high impact on
	      run-time) might be interesting only for, e.g, debugging with valgrind.
	    </dd><dt><span class="term">
	      --enable-publicquietmira
	    </span></dt><dd>
	      Some parts of MIRA can dump additional debug information during
	      assembly; setting this switch to "no" enables this. Warning:
	      MIRA will be a bit chatty, using this is not recommended for
	      public usage.
	    </dd><dt><span class="term">
	      --enable-developmentversion
	    </span></dt><dd>
	      Using MIRA with enabled development mode may lead to extra
	      output on stdout as well as some additional data in the results
	      which should not appear in real world data.
	    </dd><dt><span class="term">
	      --enable-boundtracking
	    </span></dt><dd></dd><dt><span class="term">
	      --enable-bugtracking
	    </span></dt><dd>
	      Both flags above compile some basic sanity checks for certain
	      functions into MIRA. Leaving them on "yes" (default) is
	      encouraged; the impact on run time is minimal.
	    </dd></dl></div></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_install_walkthroughs"></a>2.5. 
      Installation walkthroughs
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_walkthroughs_kubuntu"></a>2.5.1. 
	(K)Ubuntu 12.04
      </h3></div></div></div><p>
	You will need to install a couple of tools and libraries before
	compiling MIRA. Here's the recipe:
      </p><pre class="screen">
<strong class="userinput"><code>sudo apt-get install make flex
sudo apt-get install libboost-doc libboost.*1.48-dev libboost.*1.48.0</code></strong></pre><p>
	Once this is done, you can unpack and compile MIRA. For a dynamically
	linked version, use:
      </p><pre class="screen">
<strong class="userinput"><code>tar xvjf <em class="replaceable"><code>mira-4.0.0.tar.bz2</code></em>
cd <em class="replaceable"><code>mira-4.0.0</code></em>
./configure
make &amp;&amp; make install</code></strong></pre><p>
	For a statically linked version, just change the configure line from
	above into
      </p><pre class="screen">
<strong class="userinput"><code>./configure <em class="replaceable"><code>--enable-mirastatic</code></em></code></strong></pre><p>
	In case you also want to build documentation yourself, you will need
	this in addition:
      </p><pre class="screen"><strong class="userinput"><code>sudo apt-get install xsltproc docbook-xsl dblatex</code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  People working on git checkouts of the MIRA source code will
	  obviously need some more tools. Get them with this:
	</p><pre class="screen"><strong class="userinput"><code>sudo apt-get install automake libtool xutils-dev</code></strong></pre></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_walkthroughs_opensuse"></a>2.5.2. 
	openSUSE 12.1
      </h3></div></div></div><p>
	You will need to install a couple of tools and libraries before
	compiling MIRA. Here's the recipe:
      </p><pre class="screen">
<strong class="userinput"><code>sudo zypper install gcc-c++ boost-devel
sudo zypper install flex libexpat-devel zlib-devel</code></strong></pre><p>
	Once this is done, you can unpack and compile MIRA. For a dynamically
	linked version, use:
      </p><pre class="screen">
<strong class="userinput"><code>tar xvjf <em class="replaceable"><code>mira-4.0.0.tar.bz2</code></em>
cd <em class="replaceable"><code>mira-4.0.0</code></em>
./configure
make &amp;&amp; make install</code></strong></pre><p>
	In case you also want to build documentation yourself, you will need
	this in addition:
      </p><pre class="screen"><strong class="userinput"><code>sudo zypper install docbook-xsl-stylesheets dblatex</code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  People working on git checkouts of the MIRA source code will
	  obviously need some more tools. Get them with this:
	</p><pre class="screen"><strong class="userinput"><code>sudo zypper install automake libtool xutils-dev</code></strong></pre></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_walkthroughs_fedora"></a>2.5.3. 
	Fedora 17
      </h3></div></div></div><p>
	You will need to install a couple of tools and libraries before
	compiling MIRA. Here's the recipe:
      </p><pre class="screen">
<strong class="userinput"><code>sudo yum -y install gcc-c++ boost-devel
sudo yum install flex expat-devel vim-common zlib-devel</code></strong></pre><p>
	Once this is done, you can unpack and compile MIRA. For a dynamically
	linked version, use:
      </p><pre class="screen">
<strong class="userinput"><code>tar xvjf <em class="replaceable"><code>mira-4.0.0.tar.bz2</code></em>
cd <em class="replaceable"><code>mira-4.0.0</code></em>
./configure
make &amp;&amp; make install</code></strong></pre><p>
	In case you also want to build documentation yourself, you will need
	this in addition:
      </p><pre class="screen"><strong class="userinput"><code>sudo yum -y install docbook-xsl dblatex</code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  People working on git checkouts of the MIRA source code will
	  obviously need some more tools. Get them with this:
	</p><pre class="screen"><strong class="userinput"><code>sudo yum -y install automake libtool xorg-x11-util-devel</code></strong></pre></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_walkthroughs_allfromscratch"></a>2.5.4. 
	Compile everything from scratch
      </h3></div></div></div><p>
	This lets you build a self-contained static MIRA binary. The only
	prerequisite here is that you have a working <span class="command"><strong>gcc</strong></span>
	&#8805; 4.6.2. Please download all necessary files (expat, flex, etc.pp)
	and then simply follow the script below. The only things that you will
	want to change are the path used and, maybe, the name of some packages
	in case they were bumped up a version or revision.
      </p><p>
	Contributed by Sven Klages.
      </p><pre class="screen">
## whatever path is appropriate
<strong class="userinput"><code>cd <em class="replaceable"><code>/home/gls/SvenTemp/install</code></em></code></strong>

## expat
<strong class="userinput"><code>tar zxvf <em class="replaceable"><code>expat-2.0.1.tar.gz</code></em>
cd <em class="replaceable"><code>expat-2.0.1</code></em>
./configure <em class="replaceable"><code>--prefix=/home/gls/SvenTemp/expat</code></em>
make &amp;&amp; make install</code></strong>

## flex
<strong class="userinput"><code>cd <em class="replaceable"><code>/home/gls/SvenTemp/install</code></em>
tar zxvf <em class="replaceable"><code>flex-2.5.35.tar.gz</code></em>
cd <em class="replaceable"><code>flex-2.5.35</code></em>
./configure <em class="replaceable"><code>--prefix=/home/gls/SvenTemp/flex</code></em>
make &amp;&amp; make install
cd <em class="replaceable"><code>/home/gls/SvenTemp/flex/bin</code></em>
ln -s flex flex++
export PATH=<em class="replaceable"><code>/home/gls/SvenTemp/flex/bin</code></em>:$PATH</code></strong>

## boost
<strong class="userinput"><code>cd <em class="replaceable"><code>/home/gls/SvenTemp/install</code></em>
tar zxvf <em class="replaceable"><code>boost_1_48_0.tar.gz</code></em>
cd <em class="replaceable"><code>boost_1_48_0</code></em>
./bootstrap.sh --prefix=<em class="replaceable"><code>/home/gls/SvenTemp/boost</code></em>
./b2 install</code></strong>

## mira itself
<strong class="userinput"><code>export CXXFLAGS="-I<em class="replaceable"><code>/home/gls/SvenTemp/flex/include</code></em>"

cd <em class="replaceable"><code>/home/gls/SvenTemp/install</code></em>
tar zxvf <em class="replaceable"><code>mira-3.4.0.1.tar.gz</code></em>
cd <em class="replaceable"><code>mira-3.4.0.1</code></em>
./configure --prefix=<em class="replaceable"><code>/home/gls/SvenTemp/mira</code></em> \
--with-boost=<em class="replaceable"><code>/home/gls/SvenTemp/boost</code></em> \
--with-expat=<em class="replaceable"><code>/home/gls/SvenTemp/expat</code></em> \
--enable-mirastatic
make &amp;&amp; make install</code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_walkthroughs_dynamic"></a>2.5.5. 
	Dynamically linked MIRA
      </h3></div></div></div><p>
	In case you do not want a static binary of MIRA, but a dynamically
	linked version, the following script by Robert Bruccoleri will give
	you an idea on how to do this.
      </p><p>
	Note that he, having root rights, puts all additional software in
	/usr/local, and in particular, he keeps updated versions of Boost and
	Flex there.
      </p><pre class="screen">
#!/bin/sh -x

make distclean
oze=`find . -name "*.o" -print`
if [ -n "$oze" ]
then
   echo "Not clean."
   exit 1

fi

export prefix=${BUILD_PREFIX:-/usr/local}
export LDFLAGS="-Wl,-rpath,$prefix/lib"

./configure --prefix=$prefix \
           --enable-debug=yes \
           --enable-mirastatic=no \
           --with-boost-libdir=$prefix/lib \
           --enable-optimisations \
           --enable-boundtracking=yes \
           --enable-bugtracking=yes \
           --enable-extendedbugtracking=no
make
make install</pre></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_install_hintotherplatforms"></a>2.6. 
      Compilation hints for other platforms.
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_hintosx"></a>2.6.1. 
	Mac OS X
      </h3></div></div></div><p>
	This has been tested on OSX 10.6.4. You will need XCode (from Apple)
	and some packages from MacPorts.
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem">Download and install a current XCode</li><li class="listitem">Download and compile a current GCC (&#8805; 4.8.2). Do NOT use a
	GCC from MacPorts, it lacks a vitally important library
	(<code class="filename">libstdc++.a</code>)</li><li class="listitem">Download, compile with GCC and install a current BOOST
	library</li><li class="listitem">Download, compile with GCC and install all libraries MIRA
	needs (flex, etc.pp). Follow the directions given in <a class="xref" href="#sect_install_walkthroughs_allfromscratch" title="2.5.4.  Compile everything from scratch">Section 2.5.4, &#8220;
	Compile everything from scratch
      &#8221;</a></li><li class="listitem">Download the MIRA source package and unpack it</li><li class="listitem"><p>
	    In the unpacked MIRA directory, create a directory called
	    <code class="filename">OSXstatlibs</code>. Into this directory, you need to
	    softlink all needed static libraries from GCC, BOOST, flex,
	    etc.pp.
	  </p><p>
	    E.g., I have GCC installed in
	    <code class="filename">/opt/localwgcc48/</code> and therefore I need to use
	    the following to link GCC static libraries:
	  </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>ln -s <em class="replaceable"><code>/opt/localwgcc48/lib/*a</code></em> .</code></strong></pre><p>
	    I have all the other libraries (BOOST, flex, etc.pp) installed in
	    <code class="filename">/opt/biosw/</code>, therefore I also need to link
	    these libraries:
	  </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>ln -s <em class="replaceable"><code>/opt/biosw/lib/*a</code></em> .</code></strong></pre></li><li class="listitem"><p>
	    Run <code class="literal">./configure --enable-mirastatic ...</code> where
	    "..." stands for additional configure parameters needed and then
	    run <code class="literal">make</code>.
	  </p></li></ol></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	As of now (April 2014), if you are on OSX 10.9 (Mavericks) and are
	using GCC &#8804; 4.8.2, the steps described above
	may not be enough. If an error occurs at the linking stage very late in
	the MIRA building process, you need to patch a system file as described
	in <a class="ulink" href="http://trac.macports.org/ticket/41033" target="_top">http://trac.macports.org/ticket/41033</a></td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	A checkout from git needs some packages from MacPorts:
	<pre class="screen"><code class="prompt">$</code> <strong class="userinput"><code>port install autoconf automake libtool</code></strong></pre></td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  Building documentation needs the packages 'libxslt' and 'dblatex' from MacPorts.
	</p><pre class="screen"><code class="prompt">$</code> <strong class="userinput"><code>port install dblatex libxslt</code></strong></pre><p>
	  (Feb 2014) The above may fail while installing one or another
	  dependency (for me it was while installing 'urw-fonts'). If that is
	  the case, repeat a couple of times and normally it should work.
	</p></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_install_hintnetbsd5"></a>2.6.2. 
	NetBSD 5 (i386)
      </h3></div></div></div><p>
	Contributed by Thomas Vaughan
      </p><p>
	The system flex <span class="emphasis"><em>(/usr/bin/flex)</em></span> is too old, but the
	devel/flex package from a recent pkgsrc works fine. BSD make doesn't
	like one of the lines in <span class="emphasis"><em>src/progs/Makefile</em></span>, so use GNU make instead
	(available from <span class="emphasis"><em>pkgsrc</em></span> as <span class="emphasis"><em>devel/gmake</em></span>). Other relevant pkgsrc packages:
	<span class="emphasis"><em>devel/boost-libs</em></span>, <span class="emphasis"><em>devel/boost-headers</em></span>
	and <span class="emphasis"><em>textproc/expat</em></span>. The configure script has to
	be told about these pkgsrc prerequisites (they are usually rooted
	at <span class="emphasis"><em>/usr/pkg</em></span> but other locations are possible):
      </p><pre class="screen"><strong class="userinput"><code>FLEX=/usr/pkg/bin/flex ./configure --with-expat=/usr/pkg --with-boost=/usr/pkg</code></strong></pre><p>
	If attempting to build a pkgsrc package of MIRA, note that the LDFLAGS
	passed by the pkgsrc mk files don't remove the need for
	the <span class="emphasis"><em>--with-boost</em></span> option.  The configure script
	complains about flex being too old, but this is harmless because it
	honours the $FLEX variable when writing out makefiles.
      </p></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_reference"></a>Chapter 3. MIRA 4 reference manual</h1></div><div><h3 class="subtitle"><i>aka: The extended man page of MIRA 4,
  a genome and EST/RNASeq sequence assembly and mapping program for Sanger, 454, IonTorrent,
  PacBio and Illumina/Solexa sequencing data</i></h3></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_ref_synopsis">3.1. 
      Synopsis
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_requirements">3.2. 
      Requirements
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_working_modes">3.3. 
      Working modes
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_config">3.4. 
      Configuring an assembly: files and parameters
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_ref_manifest_introduction">3.4.1. 
	The manifest file: introduction
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_manifest_basics">3.4.2. 
	The manifest file: basics
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_manifest_readgroups">3.4.3. 
	The manifest file: information on the data you have
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_readgroup">3.4.3.1. 
	Starting a new readgroup
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_data">3.4.3.2. 
	Defining data files to load
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_defaultqual">3.4.3.3. 
	Setting default quality
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_technology">3.4.3.4. 
	  Defining technology used to sequence
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_nostatistics">3.4.3.5. 
	  Preventing statistics for technologies with biases
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_asreference">3.4.3.6. 
	Setting reference sequence for mapping jobs
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_autopairing">3.4.3.7. 
	Autopairing: letting MIRA find out pair info by itself
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_templatesize">3.4.3.8. 
	Setting size of read templates
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_segplace">3.4.3.9. 
	Read segment placement
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_segname">3.4.3.10. 
	Read segment naming
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_strainname">3.4.3.11. 
	Strain naming
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_datadirscf">3.4.3.12. 
	Data directory for SCF files
      </a></span></dt><dt><span class="sect3"><a href="#sect_ref_manifest_readgroups_renameprefix">3.4.3.13. 
	Renaming read name prefixes
      </a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_ref_manifest_parameters">3.4.4. 
	The manifest file: extended parameters
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_ref_parameter_groups">3.4.4.1. 
	  Parameter groups
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_technology_sections">3.4.4.2. 
	  Technology sections
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_parameter_shortnames">3.4.4.3. 
	  Parameter short names
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_order_dependent_quick_switches">3.4.4.4. 
	  Order dependent quick switches
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_general_ge">3.4.4.5. 
	  Parameter group: -GENERAL (-GE)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_assembly_as">3.4.4.6. 
	  Parameter group: -ASSEMBLY (-AS)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_strain_backbone_sb">3.4.4.7. 
	  Parameter group: -STRAIN/BACKBONE (-SB)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_dataprocessing_dp">3.4.4.8. 
	  Parameter group: -DATAPROCESSING (-DP)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_clipping_cl">3.4.4.9. 
	  Parameter group: -CLIPPING (-CL)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_skim_sk">3.4.4.10. 
	  Parameter group: -SKIM (-SK)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_hashstatistics_hs">3.4.4.11. 
	  Parameter group: -KMERSTATISTICS (-KS)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_align_al">3.4.4.12. 
	  Parameter group: -ALIGN (-AL)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_contig_co">3.4.4.13. 
	  Parameter group: -CONTIG (-CO)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_edit_ed">3.4.4.14. 
	  Parameter group: -EDIT (-ED)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_misc_mi">3.4.4.15. 
	  Parameter group: -MISC (-MI)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_misc_nw">3.4.4.16. 
	  Parameter group: -NAG_AND_WARN (-NW)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_directory_dir_di">3.4.4.17. 
	  Parameter group: -DIRECTORY (-DIR, -DI)
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_output_out">3.4.4.18. 
	  Parameter group: -OUTPUT (-OUT)
	</a></span></dt></dl></dd></dl></dd><dt><span class="sect1"><a href="#sect_ref_resuming_assemblies">3.5. 
      Resuming / restarting assemblies
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_input_output">3.6. 
      Input / Output
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_ref_directories">3.6.1. 
	Directories
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_filenames">3.6.2. 
	Filenames
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_ref_output">3.6.2.1. 
	  Output
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_assembly_statistics_and_information_files">3.6.2.2. 
	  Assembly statistics and information files
	</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_ref_file_formats">3.6.3. 
	File formats
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_stdout_stderr">3.6.4. 
	STDOUT/STDERR
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_ssaha2smalt">3.6.5. 
	SSAHA2 / SMALT ancillary data
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_xml_traceinfo">3.6.6. 
	XML TRACEINFO ancillary data
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_contig_naming">3.6.7. 
	Contig naming
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_recovering_strain_specific_consensus">3.6.8. 
	Recovering strain specific consensus as FASTA
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_ref_tags_used_in_the_assembly_by_mira_and_edit">3.7. 
      Tags used in the assembly by MIRA and EdIt
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_ref_tags_read_and_used">3.7.1. 
	Tags read (and used)
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_tags_set_and_used">3.7.2. 
	Tags set (and used)
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_ref_contigs_singlets_debris">3.8. 
      Where reads end up: contigs, singlets, debris
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_snp_discovery">3.9. 
      Detection of bases distinguishing non-perfect repeats and SNP discovery
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_data_reduction">3.10. 
      Data reduction: subsampling vs. lossless digital normalisation
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_caveats">3.11. 
      Caveats
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_ref_using_artificial_reads">3.11.1. 
	Using data not from sequencing instruments: artificial / synthetic reads
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_ploidy_and_repeats">3.11.2. 
	Ploidy and repeats
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_handling_of_repeats">3.11.3. 
	Handling of repeats
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_ref_uniform_read_distribution">3.11.3.1. 
	  Uniform read distribution
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_keeping_'long'_repetitive_contigs_separate">3.11.3.2. 
	  Keeping 'long' repetitive contigs separate
	</a></span></dt><dt><span class="sect3"><a href="#sect_ref_helping_finishing_by_tagging_reads_with_haf_tags">3.11.3.3. 
	  Helping finishing by tagging reads with HAF tags
	</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_ref_consensus_in_finishing_programs_gap4_consed_">3.11.4. 
	Consensus in finishing programs (gap4, consed, ...)
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_some_other_things_to_consider">3.11.5. 
	Some other things to consider
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_ref_things_you_should_not_do">3.12. 
      Things you should not do
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_ref_never_on_nfs">3.12.1. 
	Do not run MIRA on NFS mounted directories without redirecting the tmp directory
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_never_without_quality_values">3.12.2. 
	Do not assemble without quality values
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_ref_useful_third_party_programs">3.13. 
      Useful third party programs
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_speed_and_memory_considerations">3.14. 
      Speed and memory considerations
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_ref_memory">3.14.1. 
	Estimating needed memory for an assembly project
      </a></span></dt><dt><span class="sect2"><a href="#sect_ref_speed">3.14.2. 
	Some numbers on speed
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_ref_known_problems_bugs">3.15. 
      Known Problems / Bugs
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_todos">3.16. 
      TODOs
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_working_principles">3.17. 
      Working principles
    </a></span></dt><dt><span class="sect1"><a href="#sect_ref_see_also">3.18. 
      See Also
    </a></span></dt></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">The manual only makes sense after you learn the program.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_synopsis"></a>3.1. 
      Synopsis
    </h2></div></div></div><p>
      <code class="literal">mira [-chmMrtv] <em class="replaceable"><code>manifest-file</code></em>  [<em class="replaceable"><code>manifest-file</code></em> ...]</code>
    </p><p>
      The command line parameters in short:
    </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	  [-c / --cwd=<em class="replaceable"><code>directory</code></em>]
	</span></dt><dd>
	  Change working directory.
	</dd><dt><span class="term">
	  [-h / --help]
	</span></dt><dd>
	  Print a short help and exit.
	</dd><dt><span class="term">
	  [-m / --mcheck]
	</span></dt><dd>
	  Only check the manifest file, then exit.
	</dd><dt><span class="term">
	  [-M / --mdcheck]
	</span></dt><dd>
	  Only check the manifest file and presence of data files, then exit.
	</dd><dt><span class="term">
	  [-r / --resume]
	</span></dt><dd>
	  Resume / restart an interrupted assembly. Works only for de-novo
	  assemblies at the moment.
	</dd><dt><span class="term">
	  [-t / --thread=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	</span></dt><dd>
	  Force number of threads (overrides equivalent [-GE:not]
          manifest entry).
	</dd><dt><span class="term">
	  [-v / --version]
	</span></dt><dd>
	  Print version and exit.
	</dd></dl></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_requirements"></a>3.2. 
      Requirements
    </h2></div></div></div><p>
      To use MIRA itself, one doesn't need very much:
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	  Sequence data in EXP, CAF, PHD, FASTA or FASTQ format
	</p></li><li class="listitem"><p>
	  Optionally: ancillary information in NCBI traceinfo XML format;
	  ancillary information about strains in tab delimited format, vector
	  screen information generated with <span class="command"><strong>ssaha2</strong></span> or
	  <span class="command"><strong>smalt</strong></span>.
	</p></li><li class="listitem"><p>
	  Some memory and disk space. Actually lots of both if you are
	  venturing into 454 or Illumina.
	</p></li></ul></div><p>
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_working_modes"></a>3.3. 
      Working modes
    </h2></div></div></div><p>
      MIRA has three basic working modes: genome, EST/RNASeq or
      EST-reconstruction-and-SNP-detection. From version 2.4 on, there is
      only one executable which supports all modes. The name with which this
      executable is called defines the working mode:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  <span class="command"><strong>mira</strong></span> for assembly of genomic data as well as
	  assembly of EST data from one or multiple strains / organisms
	</p><p>
	  and
	</p></li><li class="listitem"><p>
	  <span class="command"><strong>miraSearchESTSNPs</strong></span> for assembly of EST data from
	  different strains (or organisms) and SNP detection within this
	  assembly. This is the former <span class="command"><strong>miraEST</strong></span> program
	  which was renamed as many people got confused regarding whether to
	  use MIRA in est mode or miraEST.
	</p></li></ol></div><p>
      Note that <span class="command"><strong>miraSearchESTSNPs</strong></span> is usually realised as
      a link to the <span class="command"><strong>mira</strong></span> executable, the executable
      decides by the name it was called with which module to start.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_config"></a>3.4. 
      Configuring an assembly: files and parameters
    </h2></div></div></div><p>
      All the configuration needed for an assembly is done in one (or several)
      configuration file(s): the <span class="emphasis"><em>manifest</em></span> files. This
      encompasses things like what kind of assembly you want to perform
      (genome or EST / RNASeq, mapping or de-novo etc.pp) or which data files
      contain the sequences you want to assemble (and in which format these
      are).
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_manifest_introduction"></a>3.4.1. 
	The manifest file: introduction
      </h3></div></div></div><p>
	A <span class="emphasis"><em>manifest</em></span> file can be seen as a two part
	configuration file for an assembly: the first part contains some
	general information while the second part contains information about
	the sequencing data to be loaded. Examples being always easier to
	follow than long texts, here's an example for a de-novo assembly with
	single-end (also called shotgun) 454 data:
      </p><pre class="screen"># Example for a manifest describing a simple 454 de-novo assembly

# A manifest file can contain comment lines, these start with the #-character

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should assemble a genome de-novo in accurate mode
# As special parameter, we want to use 4 threads in parallel (where possible)

<strong class="userinput"><code>
project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,denovo,accurate</code></em>
parameters = <em class="replaceable"><code>-GE:not=4</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups": this reflects the
#  ... that read sequences ...

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomeUnpaired454ReadsIGotFromTheLab</code></em>
data = <em class="replaceable"><code>TCMFS456ZH345.fastq TQF92GT7H34.fastq</code></em>
technology = <em class="replaceable"><code>454</code></em></code></strong></pre><p>
	To make things a bit more interesting, here's an example using a
	couple more technologies and showing some more options of the manifest
	file like wild cards in file names, different paired-end/mate-pair
	libraries and how to let MIRA refine pairing information (or even find
	out everything by itself):
      </p><pre class="screen"># Example for a manifest describing a de-novo assembly with
# unpaired 454, paired-end Illumina, a mate-pair Illumina
# and a paired Ion Torrent

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should assemble a genome de-novo in accurate mode
# As special parameter, we want to use 4 passes with kmer sizes of
# 17, 31, 63 and 127 nucleotides. Obviously, read lengths of the
# libraries should be greater than 127 bp.
# Note: usually MIRA will choose sensible options for number of
#  passes and kmer sizes to be used by itself.

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,denovo,accurate</code></em>
parameters = <em class="replaceable"><code>-AS:kms=17,31,63,127</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups": this reflects the
#  ... that read sequences ...

# defining the shotgun (i.e. unpaired) 454 reads
<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomeUnpaired454ReadsIGotFromTheLab</code></em>
data = <em class="replaceable"><code>TCMFS456ZH345.fastq TQF92GT7H34.fastq</code></em>
technology = <em class="replaceable"><code>454</code></em></code></strong>

# defining the paired-end Illumina reads, fixing all needed pair information
<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomePairedEndIlluminaReadsIGotFromTheLab</code></em>
data = <em class="replaceable"><code>datape*.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
template_size = <em class="replaceable"><code>100 300</code></em>
segment_placement = <em class="replaceable"><code>---&gt; &lt;---</code></em>
segment_naming = <em class="replaceable"><code>solexa</code></em></code></strong>

# defining the mate-pair Illumina reads, fixing most needed pair information
#  but letting MIRA refine the template_size via "autorefine"
<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomeMatePairIlluminaReadsIGotFromTheLab</code></em>
data = <em class="replaceable"><code>datamp*.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
template_size = <em class="replaceable"><code>2000 4000 autorefine</code></em>
segment_placement = <em class="replaceable"><code>&lt;--- ---&gt;</code></em>
segment_naming = <em class="replaceable"><code>solexa</code></em></code></strong>

# defining paired Ion Torrent reads
# example to show how lazy one can be and simply let MIRA estimate by itself
#  all needed pairing information via "autopairing"
#  Hint: it usually does a better job at it than we do ;-)
<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomePairedIonReadsIGotFromTheLab</code></em>
<em class="replaceable"><code>autopairing</code></em>
data = <em class="replaceable"><code>dataion*.fastq</code></em>
technology = <em class="replaceable"><code>iontor</code></em></code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_manifest_basics"></a>3.4.2. 
	The manifest file: basics
      </h3></div></div></div><p>
	The first part of an assembly <span class="emphasis"><em>manifest</em></span> contains
	the very basic information the assembler needs to have to know what
	you want it to do. This part consists of exactly three entries:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    <span class="bold"><strong>project =</strong></span>
	    <em class="replaceable"><code>project name</code></em> tells the assembler
	    the name you wish to give to the whole assembly project. MIRA will
	    use that name throughout the whole assembly for naming
	    directories, files and a couple of other things.
	  </p><p>
	    You can name the assembly any way you want, you should however
	    restrain yourself and use only alphanumeric characters and perhaps
	    the characters plus, minus and underscore. Using slashes or
	    backslashes here is a recipe for catastrophe.
	  </p></li><li class="listitem"><p>
	    <span class="bold"><strong>job =</strong></span>
	     [<em class="replaceable"><code>denovo|mapping</code></em>],
	     [<em class="replaceable"><code>genome|est|fragments</code></em>],
	     [<em class="replaceable"><code>draft|accurate</code></em>] tells the
	    assembler what kind of data it should expect and how it should
	    assemble it.
	  </p><p>
	  You need to make your choice mainly in three steps and in the end
	  concatenate your choices to the [job=] entry of the manifest:
	  </p><div class="orderedlist"><ol class="orderedlist" type="a"><li class="listitem"><p>
		are you building an assembly from scratch
		(choose: <span class="emphasis"><em>denovo</em></span>) or are you mapping reads
		to an existing backbone sequence
		(choose: <span class="emphasis"><em>mapping</em></span>)?  Pick one. Leaving this
		out automatically chooses <span class="emphasis"><em>denovo</em></span> as
		default.
	      </p></li><li class="listitem"><p>
		are the data you are assembling forming a larger contiguous
		sequence (choose: <span class="emphasis"><em>genome</em></span>) or are you
		assembling small fragments like in EST or mRNA libraries
		(choose: <span class="emphasis"><em>est</em></span>)? Pick one. Leaving this out
		automatically chooses <span class="emphasis"><em>genome</em></span> as default.
	      </p><p>
		Since version 4.9.4, a new mode <span class="emphasis"><em>fragments</em></span>
		is available. This mode is essentially similar to the
		<span class="emphasis"><em>EST</em></span> mode, but has all safety features
		switched off which reduce data sizes. Use this mode for
		assembly of comparatively small EST/mRNA projects where you
		want to have highest accuracy and minimal filtering. Warning:
		contigs with coverages going into the 1000s will lead to
		really slow assemblies.
	      </p></li><li class="listitem"><p>
		do you want a quick and dirty assembly for first insights
		(choose: <span class="emphasis"><em>draft</em></span>) or an assembly that should
		be able to tackle even most nasty cases (choose:
		<span class="emphasis"><em>accurate</em></span>)? Pick one. Leaving this out
		automatically chooses <span class="emphasis"><em>accurate</em></span> as default.
	      </p></li></ol></div><p>
	    Once you're done with your choices, concatenate everything with
	    commas and you're done. E.g.:
	    '<code class="literal">job = mapping,genome,draft</code>' will give you a
	    mapping assembly of a genome in draft quality.
	  </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	    For de-novo assembly of genomes, these switches are optimised for
	    'decent' coverages that are commonly seen to get you something useful,
	    i.e., &#8805; 7x for Sanger, &#8805; 18x for 454 FLX or Titanium, &#8805; 25x for
	    454 GS20 and &#8805; 30x for Solexa. Should you venture into lower
	    coverage or extremely high coverage (say, &#8805; 60x for 454), you will
	    need to adapt a few parameters via extensive switches.
	  </td></tr></table></div></li><li class="listitem"><p>
	    <span class="bold"><strong>parameters =</strong></span> is used in case you
	    want to change one of the 150+ extended parameters MIRA has to
	    offer to control almost every aspect of an assembly. This is
	    described in more detail in a separate section below.
	  </p></li></ol></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_manifest_readgroups"></a>3.4.3. 
	The manifest file: information on the data you have
      </h3></div></div></div><p>
	The second part of an assembly <span class="emphasis"><em>manifest</em></span> tells
	MIRA which files it needs to load, which sequencing technology
	generated the data, whether there are DNA template constraints it can
	use during the assembly process and a couple of other things.
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_readgroup"></a>3.4.3.1. 
	Starting a new readgroup
      </h4></div></div></div><p>
	  <span class="bold"><strong>readgroup </strong></span> [= <em class="replaceable"><code>group name</code></em>] is the keyword which tells MIRA that you are going to define a new read group. You can optionally name that group.
	</p><div class="sidebar"><a name="sidebar_ref_manifest_readgroups_templates_and_readgroups"></a><div class="titlepage"><div><div><p class="title"><b>
	    Understanding readgroups and DNA templates
	  </b></p></div></div></div><p>
	    When you send away your DNA for sequencing, it is going to be
	    prepared for sequencing according to your wishes. Sequencing
	    providers call this "constructing a library" and regardless
	    whether you sequence with Sanger, 454, Illumina, Ion Torrent,
	    Pacific Biosciences or other technologies, the "library prep" is
	    always there.
	  </p><p>
	    With most library preps, your DNA is first amplified and then
	    cut into small pieces. These pieces are called
	    <span class="emphasis"><em>templates</em></span> and their length can be anywhere
	    between a few dozen bases, a few hundred bases or even a couple
	    of dozen or even hundred kilobases. The important thing is that
	    these templates can be much bigger in size than the actual read
	    length. While this is a wet lab step, protocols and providers
	    have gotten pretty good at constructing libraries where the DNA
	    templates are all in a given range of bases like, e.g., having a
	    library with template size 500bp (+/- 100bp) and another library
	    with template size around 7kb (+/- 500bp).
	  </p><p>
	    Depending on the technology and sequencing strategy used, the
	    DNA templates are used to create either one single read or - and
	    that's important - two or more reads.
	  </p><p>
	    Libraries with "single reads" are often called "single read
	    libraries" or "shotgun libraries". They can be found for every
	    sequencing technology and are most of the time easy to construct
	    (therefore cheap) and are often used to provide a decent amount
	    of bases as basic coverage for your project.
	  </p><p>
	    Libraries with two reads per DNA template are often called
	    "mate-pair" or "paired-end" libraries. They are harder to
	    construct and sometimes have less yield, therefore they are often
	    more expensive. But the sequencing approach using several reads
	    per DNA template allows assembly and scaffolding algorithms to
	    resolve repetitive regions of a genome which are longer than the
	    average read length. Note that Pacific Biosciences has a
	    sequencing mode called "strobed sequencing" which is different
	    from "paired-end/mate-pair" but also creates multiple reads per
	    DNA template.
	  </p><p>
	    Long story short: an assembler must know afterwards what kind of
	    reads it has to expect: the sequencing technology, library
	    preparation strategy etc. For this, the notion of <span class="emphasis"><em>read
	    groups</em></span> has emerged: reads coming from the same
	    technology and same library preparation are pooled together in a
	    read group to tell the assembler: in the assembly, if you see two
	    reads coming from the same DNA template, you should expect them to
	    be at a certain distance from each other and they should be
	    oriented in a certain way.
	  </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	    The above was a <span class="bold"><strong>very</strong></span> simplified
	    view on the whole area of DNA templates, readgroups, shotgun and
	    paired end sequencing. Enough to hopefully understand the
	    concepts, but you might want to read more about it.
	  </td></tr></table></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_data"></a>3.4.3.2. 
	Defining data files to load
      </h4></div></div></div><p>
	  <span class="bold"><strong>data</strong></span> = <em class="replaceable"><code>filepath
	  [filepath ...]</code></em> defines the file paths from
	  which sequences should be loaded. A file path can contain just the
	  name of one (or several) files or it can contain the
	  <span class="emphasis"><em>path</em></span>, i.e., the directory (absolute or
	  relative) including the file name.
	</p><p>
	  MIRA automatically recognises what type the sequence data is by
	  looking at the postfix of files. For postfixes not adhering to widely
	  used naming schemes for file types, there's additionally a way of
	  explicitly defining the type (see further down at the end of this
	  item on how this is done). Currently allowed file types are:
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      <code class="filename">.fasta</code> for sequences formatted in FASTA
	      format where there exists an additional
	      <code class="filename">.fasta.qual</code> file which contains quality
	      data. If the file with quality data is missing, this is
	      interpreted as an error and MIRA will abort.
	    </p></li><li class="listitem"><p>
	      <code class="filename">.fna</code> and <code class="filename">.fa</code> also
	      for sequences formatted in FASTA format. The difference
	      to <code class="filename">.fasta</code> lies in the way MIRA treats a
	      missing quality file (called
	      <code class="filename">.fna.qual</code>
	      or <code class="filename">.fa.qual</code>): it does not see that as
	      a critical error and continues.
	    </p></li><li class="listitem"><p>
	      <code class="filename">.fastq</code> or <code class="filename">.fq</code> for files in FASTQ format
	    </p></li><li class="listitem"><p>
	      <code class="filename">.gff3</code> or <code class="filename">.gff</code> for files in GFF3 format. Note that
	      MIRA will load all sequences and annotations contained in this
	      file.
	    </p></li><li class="listitem"><p>
	      <code class="filename">.gbk</code>, <code class="filename">.gbf</code>, <code class="filename">.gbff</code>
	      or <code class="filename">.gb</code> for files formatted in GenBank
	      format. Note that the MIRA GenBank loader does not understand
	      intron/exon or other multiple-locus structures in this format,
	      use GFF3 instead!
	    </p></li><li class="listitem"><p>
	      <code class="filename">.caf</code> for files in the CAF format (from Sanger Centre)
	    </p></li><li class="listitem"><p>
	      <code class="filename">.maf</code> for files in the MIRA MAF format
	    </p></li><li class="listitem"><p>
	      <code class="filename">.exp</code> for files in the Staden EXP format.
	    </p></li><li class="listitem"><p>
	      <code class="filename">.fofnexp</code> for a <span class="emphasis"><em>file of EXP
	      filenames</em></span> which all point to files in the Staden EXP
	      format.
	    </p></li><li class="listitem"><p>
	      <code class="filename">.xml</code>, <code class="filename">.ssaha2</code> and <code class="filename">.smalt</code> for ancillary data in NCBI TRACEINFO, SSAHA2 or SMALT format respectively.
	    </p></li></ul></div><p>
	  Multiple 'data' lines and multiple entries per line (even
	  different formats) are allowed, as in, e.g.,
	</p><pre class="screen">data = file1.fastq file2.fastq file3.fasta file4.gbk
data = myscreenings.smalt</pre><p>
	  You can also use wildcards and/or directory names. E.g., loading
	  all file types MIRA understands from a given directory
	  <code class="filename">mydir</code>:
	</p><pre class="screen">data = mydir</pre><p>
	  or loading all files starting with <code class="filename">mydata</code> and
	  ending with <code class="filename">fastq</code>:
	</p><pre class="screen">data = mydata*fastq</pre><p>
	  or loading all files in directory <code class="filename">mydir</code>
	  starting with <code class="filename">mydata</code> and ending with
	  <code class="filename">fastq</code>:
	</p><pre class="screen">data = mydir/mydata*fastq</pre><p>
	  or loading all FASTQ files in all directories starting with <code class="filename">mydir</code>:
	</p><pre class="screen">data = mydir*/*fastq</pre><p>
	  or ... well, you get the gist.
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	  Giving a directory like in <code class="filename">mydir</code> is
	  equivalent to <code class="filename">mydir/*</code> (saying: give me all
	  files in the directory <code class="filename">mydir</code>), however the
	  first version should be preferred when the directory contains
	  thousands of files.
	</td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	    GenBank and GFF3 files may or may not contain embedded sequences. If
	    annotations are present in these files for which no sequence is
	    present in the same file, MIRA will look for reads of the same
	    name which it already loaded in this or previously defined read
	    groups and add the annotations there.
	  </p><p>
	    As a security measure, annotations in GenBank and GFF3 files for which
	    absolutely no sequence or read has been defined are treated as
	    an error.
	  </p></td></tr></table></div><p>
	  <span class="emphasis"><em>Explicit definition of file types.</em></span> It is
	  possible to explicitly tell MIRA the type of a file even if said
	  file does not have a 'standard' naming scheme. For this, the
	  EMBOSS double-colon notation has been adapted to work also for
	  MIRA, i.e., you prepend the type of a file and separate it from
	  the file name by a double colon. E.g.,
	  the <code class="filename">.dat</code> postfix is not anything MIRA will
	  recognise, but you can define it should be loaded as FASTQ file
	  like this:
	</p><pre class="screen">data = fastq::myfile.dat</pre><p>
	  Another frequent usage is forcing MIRA to load FASTA files
	  named <code class="filename">.fasta</code> without complaining in case
	  quality files (which MIRA wants you to provide) are not present:
	</p><pre class="screen">data = fna::myfile.fasta</pre><p>
	  This does (of course) work also with directories or wildcard
	  characters. In the following example, the first line will load all
	  files from <code class="filename">mydirectory</code> as FASTQ while the
	  second line loads just <code class="filename">.dat</code> files in a given
	  path as FASTA:
	</p><pre class="screen">data = fastq::mydirectory
data = fasta::/path/to/somewhere/*.dat</pre><p>
	  It is entirely possible (although not really sensible), to give
	  contradicting information to MIRA by using a different explicit
	  file type than one would guess from the standard postfix. In this
	  case, the explicit type takes precedence over the automatic
	  type. E.g.: to force MIRA to load a file as FASTA although it is
	  named <code class="filename">.fastq</code>, one could use this:
	</p><pre class="screen">data = fasta::file.fastq</pre><p>
	  Note that the above does not make any kind of file conversion,
	  <code class="filename">file.fastq</code> needs to be already in FASTA
	  format or else MIRA will fail loading that data.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_defaultqual"></a>3.4.3.3. 
	Setting default quality
      </h4></div></div></div><p>
	  <span class="bold"><strong>default_qual</strong></span>=
	  <em class="replaceable"><code>quality_value</code></em> is meant to be used as
	  default fall-back quality value for sequences where the data files
	  given above do not contain quality values. E.g., GFF3 or GenBank
	  formats, possibly also FASTA files where the quality data file is
	  missing.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_technology"></a>3.4.3.4. 
	  Defining technology used to sequence
	</h4></div></div></div><p>
	  <span class="bold"><strong>technology</strong></span>=
	  <em class="replaceable"><code>technology</code></em> names the technology
	  with which the sequences were produced. Allowed technologies are:
	  <span class="emphasis"><em>sanger, 454, solexa, iontor, pcbiolq, pcbiohq,
	  text</em></span>.
	</p><p>
	  The <span class="emphasis"><em>text</em></span> technology is not a technology per
	  se, but should be used for sequences which are not coming from
	  sequencing machines like, e.g., database entries, consensus
	  sequences, artificial reads (which do not comply with normal
	  behaviour of 'normal' sequencing data), etc.pp
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_nostatistics"></a>3.4.3.5. 
	  Preventing statistics for technologies with biases
	</h4></div></div></div><p>
	  <span class="bold"><strong>nostatistics</strong></span> used as keyword will
	  prevent MIRA from calculating coverage estimates from reads of the given
	  readgroup.
	</p><p>
	  This keyword should be used in denovo genome assemblies for reads
	  from libraries which produce very uneven coverage (e.g.: old
	  Illumina mate-pair protocols) or have a bias in the randomness of
	  DNA fragmentations (e.g.: Nextera protocol from Illumina).
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_asreference"></a>3.4.3.6. 
	Setting reference sequence for mapping jobs
      </h4></div></div></div><p>
	  <span class="bold"><strong>as_reference</strong></span> This keyword
	  indicates to MIRA that the sequences in this readgroup should not
	  be assembled, but should be used as reference backbone for a
	  mapping assembly. That is, sequencing reads are then placed/mapped
	  onto these reference reads.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_autopairing"></a>3.4.3.7. 
	Autopairing: letting MIRA find out pair info by itself
      </h4></div></div></div><p>
	  <span class="bold"><strong>autopairing</strong></span> This keyword is used
	  to tell MIRA it should estimate values for
	  <span class="emphasis"><em>template_size</em></span> and
	  <span class="emphasis"><em>segment_placement</em></span> (see below).
	</p><p>
	  This is basically the lazy way to tell MIRA that the data in the
	  corresponding readgroup consists of paired reads and that you
	  trust it will find out the correct values.
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><span class="emphasis"><em>autopairing</em></span> usually works quite well for
	  small and mid-sized libraries (up to, say, 10 kb). For larger
	  libraries it might be a good thing to tell MIRA some rough
	  boundaries via <span class="emphasis"><em>template_size</em></span> /
	  <span class="emphasis"><em>segment_placement</em></span> and let MIRA refine the
	  values for the template size via <span class="emphasis"><em>autorefine</em></span>
	  (see below).
	</td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><span class="emphasis"><em>autopairing</em></span> is a feature new to MIRA 4.0rc5,
	  it may contain bugs for some corner cases. Feedback appreciated.
	</td></tr></table></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_templatesize"></a>3.4.3.8. 
	Setting size of read templates
      </h4></div></div></div><p>
	  <span class="bold"><strong>template_size </strong></span>=
	  <em class="replaceable"><code>min_size max_size
	  <span class="emphasis"><em>[infoonly|exclusion_criterion]</em></span>
	  <span class="emphasis"><em>[autorefine]</em></span></code></em>. Defines the
	  minimum and maximum size of "good" DNA templates in the library
	  prep for this read group. This defines at which distance the two
	  reads of a pair are to be expected in a contig, very useful
	  information for an assembler to resolve repeats in a genome or
	  different splice variants in transcriptome data.
	</p><p>
	  If the term <span class="emphasis"><em>infoonly</em></span> is present, then MIRA
	  will pass the information on template sizes in result files, but
	  will not use it for any decision making during de-novo or mapping
	  assembly. The term <span class="emphasis"><em>exclusion_criterion</em></span> makes
	  MIRA use the information for decision making.
	</p><p>
	  If <span class="emphasis"><em>infoonly</em></span>
	  or <span class="emphasis"><em>exclusion_criterion</em></span> are missing, then MIRA
	  assumes <span class="emphasis"><em>exclusion_criterion</em></span> for de-novo
	  assemblies and <span class="emphasis"><em>infoonly</em></span> for mapping
	  assemblies.
	</p><p>
	  If the term <span class="emphasis"><em>autorefine</em></span> is present, MIRA will
	  start the assembly with the given size information but switch to
	  refined values computed from observed distances in an
	  assembly. However, please note that the size values
	  can <span class="emphasis"><em>never</em></span> be expanded, only shrunk. It is
	  therefore advisable to use generous bounds when using the
	  autorefine feature.
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	  The <span class="emphasis"><em>template_size</em></span> line in the manifest file
	  replaces the parameters -GE:uti:tismin:tismax of earlier versions
	  of MIRA (3.4.x and below).
	</td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	  The minimum or the maximum size (or both) can be set to a negative
	  value for "don't care and don't check". This allows constructs
	  like <code class="literal">template_size= 500 -1 exclusion_criterion</code>
	  which would check only the minimum distance but not the maximum
	  distance.
	</td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	    For <span class="emphasis"><em>mapping</em></span> assemblies with MIRA, you
	    usually will want to use <span class="emphasis"><em>infoonly</em></span> as otherwise -
	    in case of genome re-arrangements, larger deletions or
	    insertions - MIRA would probably reject one read of every read
	    pair in the corresponding areas as it would not be at the
	    expected distance and/or orientation ... and you would not be
	    able to simply find the re-arrangement in downstream analysis.
	  </p><p>
	    For <span class="emphasis"><em>de-novo</em></span> assemblies however
	    you <span class="emphasis"><em>should not</em></span>
	    use <span class="emphasis"><em>infoonly</em></span> except in very rare cases
	    where you know what you are doing.
	  </p></td></tr></table></div><div class="sidebar"><div class="titlepage"><div><div><p class="title"><b>
	    Understanding the size of DNA templates
	  </b></p></div></div></div><p>
	    When using a <span class="emphasis"><em>paired-end</em></span> or
	    <span class="emphasis"><em>mate-pair</em></span> sequencing strategy, two
	    sequences are generated for the ends of each DNA template (see
	    sidebar above: "understanding readgroups and DNA
	    templates"). That is, if one has a library with 6kb fragments,
	    one knows that the outer ends of the two reads will be
	    approximately 6kb apart, like so:
	  </p><pre class="screen">DNA template    ##############################################################
read 1          .......
read 2                                                                  ......
                &lt;------------------------- ~6 kb ----------------------------&gt;</pre><p>
	    Sequencing labs will try their best to get these two sequences
	    from DNA templates which comply with a given length
	    specification. But as this is chemistry and wet lab, things must
	    be seen with a certain uncertainty and therefore the DNA
	    templates generated are not exactly of the specified size
	    (e.g. 6kb), but the size distribution will vary in a given
	    range, e.g., 5.5kb to 6.5 kb.
	  </p></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_segplace"></a>3.4.3.9. 
	Read segment placement
      </h4></div></div></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">You do not need to use this when using 'autopairing' (see above).</td></tr></table></div><p>
	  <span class="bold"><strong>segment_placement </strong></span>=
	  <em class="replaceable"><code>placementcode <span class="emphasis"><em>[infoonly|exclusion_criterion]</em></span></code></em>. Allowed
	  placement codes are:
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      <span class="bold"><strong>?</strong></span>
	      or <span class="bold"><strong>unknown</strong></span> which are
	      place-holders for "well, in the end: don't care." Segments of
	      a template can be reads in any direction and in any
	      relationship to each other.
	    </p><p>
	      This is typically used for unpaired libraries (sometimes
	      called <span class="emphasis"><em>shotgun libraries</em></span>), but may be
	      also useful for, e.g., primer walking with Sanger.
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>---&gt; &lt;---</strong></span> or <span class="bold"><strong>FR</strong></span> or <span class="bold"><strong>INNIES</strong></span>. The <span class="emphasis"><em>forward /
	      reverse</em></span> scheme as used in traditional Sanger
	      sequencing as well as Illumina paired-end sequencing.
	    </p><p>
	      This is the usual placement code for Sanger paired-end
	      protocols as well as Illumina paired-end. Less frequently used
	      in IonTorrent paired-end sequencing.
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>&lt;--- ---&gt;</strong></span> or <span class="bold"><strong>RF</strong></span> or <span class="bold"><strong>OUTIES</strong></span>. The <span class="emphasis"><em>reverse /
	      forward</em></span> scheme as used in Illumina mate-pair
	      sequencing.
	    </p><p>
	      This is the usual placement code for Illumina mate-pair protocols.
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>1---&gt; 2---&gt;</strong></span> or
	      <span class="bold"><strong>samedir forward</strong></span> or <span class="bold"><strong>SF</strong></span> or <span class="bold"><strong>LEFTIES</strong></span>. The <span class="emphasis"><em>forward /
	      forward</em></span> scheme. Segments of a template are all
	      placed in the same direction, the segment order in the contig
	      follows segment ordering of the reads.
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>2---&gt; 1---&gt;</strong></span> or <span class="bold"><strong>samedir backward</strong></span> or <span class="bold"><strong>SB</strong></span> or <span class="bold"><strong>RIGHTIES</strong></span>. Segments of a template are
	      all placed in the same direction, the segment order in the
	      contig is reversed compared to segment ordering of the reads.
	    </p><p>
	      This is the usual placement code for 454 "paired-end" and IonTorrent
	      long-mate protocols.
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>samedir</strong></span>. Segments of a
	      template are all placed in the same direction; their spatial
	      relationship, however, is not taken into account.
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>&gt;&gt;&gt;</strong></span> (reserved for
	      sequencing of several equidistant fragments per template like
	      in PacBio strobe sequencing, not implemented yet)
	    </p></li></ul></div><p>
	  If the term <span class="emphasis"><em>infoonly</em></span> is present, then MIRA
	  will pass the information on segment placement in result files, but
	  will not use it for any decision making during de-novo assembly or
	  mapping assembly. The term <span class="emphasis"><em>exclusion_criterion</em></span> makes MIRA use the information for decision making.
	</p><p>
	  If <span class="emphasis"><em>infoonly</em></span> or <span class="emphasis"><em>exclusion_criterion</em></span> are missing, then MIRA assumes <span class="emphasis"><em>exclusion_criterion</em></span> for de-novo assemblies and <span class="emphasis"><em>infoonly</em></span> for mapping assemblies.
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	    For <span class="emphasis"><em>mapping</em></span> assemblies with MIRA, you
	    usually will want to use <span class="emphasis"><em>infoonly</em></span> as otherwise -
	    in case of genome re-arrangements, larger deletions or
	    insertions - MIRA would probably reject one read of every read
	    pair (as it would not be at the expected distance and/or
	    orientation) and you would not be able to simply find the
	    re-arrangement in downstream analysis.
	  </p><p>
	    For <span class="emphasis"><em>de-novo</em></span> assemblies however
	    you <span class="emphasis"><em>should not</em></span>
	    use <span class="emphasis"><em>infoonly</em></span> except in very rare cases
	    where you know what you are doing.
	  </p></td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	  As soon as you tell MIRA that a readgroup contains paired reads (via one of the other typical readgroup parameters like template_size, segment_naming etc.), the <span class="emphasis"><em>segment_placement</em></span> line becomes mandatory in the manifest. This is because different sequencing technologies and/or library preparations result in different read orientations. E.g., Illumina libraries come in paired-end flavour which have FR (forward/reverse) placements, but there are also mate-pair libraries which have reverse/forward (RF) placements.
	</td></tr></table></div><div class="sidebar"><div class="titlepage"><div><div><p class="title"><b>
	    Understanding read segment placement on DNA templates
	  </b></p></div></div></div><p>
	    bla
	  </p></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_segname"></a>3.4.3.10. 
	Read segment naming
      </h4></div></div></div><p>
	  <span class="bold"><strong>segment_naming </strong></span>= <em class="replaceable"><code>naming_scheme <span class="emphasis"><em>[rollcomment]</em></span></code></em>. Defines
	  the naming scheme reads are following to indicate the DNA template
	  they belong to. Allowed naming schemes are: <span class="emphasis"><em>sanger,
	  stlouis, tigr, FR, solexa, sra</em></span>.
	</p><p>
	  If not defined, the defaults are <span class="underline">sanger</span> for Sanger sequencing data,
	  and <span class="underline">solexa</span> for Solexa, 454
	  and Ion Torrent.
	</p><p>
	  For FASTQ files, the modifier <span class="emphasis"><em>rollcomment</em></span> can
	  be used to let MIRA take the first token in the comment as name of
	  a read instead of the original name. E.g.: for a read
	</p><pre class="screen">@DRR014327.1.1 HWUSI-EAS547_0013:1:1:1106:4597.1 length=91
TTAGAAGGAGATCTGGAGAACATTTTAAACCGGATTGAACAACGCGGCCGTGAGATGGAGCTTCAGACAAGCCGGTCTTATTGGGACGAAC
+
bbb`bbbbabbR`\_bb_bba`b`bb_bb_`\^\^Y^`\Zb^b``]]\S^a`]]a``bbbb_bbbb]bbb\`^^^]\aaY\`\\^aa__aB</pre><p>
	  the rollcomment modifier will lead to the read being named
	  <code class="filename">HWUSI-EAS547_0013:1:1:1106:4597.1</code> (which
	  is almost the original instrument read name) instead of
	  <code class="filename">DRR014327.1.1</code> (which is the SRA read name).
	</p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	  For data from the short read archive (SRA), one will usually need
	  to explicitly specify the 'sra' naming scheme or use the
	  'rollcomment' modifier in FASTQ files.
	</td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	  This has changed with MIRA 3.9.1
	  and <span class="command"><strong>sff_extract</strong></span> 0.3.0. Before that, 454 and Ion
	  Torrent were given <span class="underline">fr</span> as naming
	  scheme.
	</td></tr></table></div><div class="sidebar"><div class="titlepage"><div><div><p class="title"><b>
	    Understanding read naming schemes
	  </b></p></div></div></div><p>
	    Read naming is a long story with lots of historical gotchas: it
	    needs to be clear and simple, but still people sometimes wanted
	    to convey additional meta-information with it. Unsurprisingly,
	    several "standards" emerged over time. In short: it's a mess. See also XKCD entry on <a class="ulink" href="http://xkcd.com/927/" target="_top">proliferating standards</a>.
	  </p><p>
	    How to choose: please read the documentation available at the
	    different centres or ask your sequence provider. In a nutshell
	    (and probably over-simplified):
	  </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
		Sanger scheme
	      </span></dt><dd><p>
		  "somename<span class="emphasis"><em>.[pqsfrw][12][bckdeflmnpt][a|b|c|...]</em></span>"
		  (e.g.  U13a08f10.p1ca), but the length of the postfix
		  must be at least 4 characters, i.e., ".p" alone will not
		  be recognised.
		</p><p>
		  Usually, ".p" + 3 characters or ".f" + 3 characters are
		  used for forward reads, while reverse complement reads
		  take either ".q" or ".r" (+ 3 characters in both cases).
		</p></dd><dt><span class="term">
		TIGR scheme
	      </span></dt><dd><p>
		  "somename<span class="emphasis"><em>TF*|TR*|TA*</em></span>"
		  (e.g. GCPBN02TF or GCPDL68TABRPT103A58B).
		</p><p>
		  Forward reads take "TF*", reverse reads "TR*".
		</p></dd><dt><span class="term">
		St. Louis scheme
	      </span></dt><dd><p>
		  "somename<span class="emphasis"><em>.[sfrxzyingtpedca]*</em></span>"
		</p></dd><dt><span class="term">
		Forward/Reverse scheme
	      </span></dt><dd><p>
		  "somename<span class="emphasis"><em>.[fr]*</em></span>"
		  (e.g.  E0K6C4E01DIGEW.f or E0K6C4E01BNDXN.r2nd).
		</p><p>
		  ".f*" for forward, ".r*" for reverse.
		</p></dd><dt><span class="term">
		Solexa scheme
	      </span></dt><dd><p>
		  Even simpler than the forward/reverse scheme, it allows
		  only for one or two reads per template:
		  "somename<span class="emphasis"><em>/[12]</em></span>"
		</p></dd><dt><span class="term">
		SRA scheme
	      </span></dt><dd><p>
		  The Short Read Archive (SRA) finally settled on a naming
		  scheme and renames each and every read within its
		  database. When you download sequences from the archive,
		  all reads will be named
		  <code class="filename">XXX000000.Y[.Z]</code> (where X's are
		  characters A-Z, 0 are digits from 0 to 9, Y is a counter
		  and Z is a number denoting the segment (usually 1,2 or
		  3)). This naming scheme is applied to reads from all
		  technologies, therefore the MIRA technology dependent
		  defaults will not apply and one must specify the 'sra'
		  naming scheme in the command line.
		</p></dd></dl></div></div><p>
	  Any wildcard in the forward/reverse suffix must be consistent for
	  a read pair, and is treated as part of the template name. This is
	  to allow multiple sequencing of a fragment, particularly common
	  with Sanger capillary data (e.g. given somename.f and somename.r,
	  resequenced as somename.f2 and somename.r2, this would be treated
	  as two pairs, with template names somename and somename_2
	  respectively).
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_strainname"></a>3.4.3.11. 
	Strain naming
      </h4></div></div></div><p>
	  <span class="bold"><strong>strain_name </strong></span>=
	  <em class="replaceable"><code>string</code></em>. Defines the strain /
	  organism-code the reads of this read group are from. If not set,
	  MIRA will assign "StrainX" to normal readgroups and
	  "ReferenceStrain" to readgroups with reference sequences.
	</p><div class="sidebar"><div class="titlepage"><div><div><p class="title"><b>
	    Understanding how MIRA uses strain information
	  </b></p></div></div></div><p>
	    bla
	  </p></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_datadirscf"></a>3.4.3.12. 
	Data directory for SCF files
      </h4></div></div></div><p>
	  <span class="bold"><strong>datadir_scf </strong></span>=
	  <em class="replaceable"><code>directory</code></em>
	</p><p>
	  For SANGER data only: tells MIRA in which directory it can find
	  SCF data belonging to reads of this read group.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_manifest_readgroups_renameprefix"></a>3.4.3.13. 
	Renaming read name prefixes
      </h4></div></div></div><p>
	  <span class="bold"><strong>rename_prefix</strong></span>=
	  <em class="replaceable"><code>prefix replacement</code></em>. Allows renaming
	  reads on the fly while loading data by searching each read name
	  for a given <span class="emphasis"><em>prefix</em></span> string and, if found,
	  replacing it with a given <span class="emphasis"><em>replacement</em></span> string.
	</p><p>
	  This is most useful for systems like Illumina or PacBio which
	  generate quite long read names which, in the end, are either
	  utterly useless for an end user or are even breaking older
	  programs which have a length restriction on read names. E.g.:
	</p><pre class="screen">rename_prefix = DQT9AAQ4:436:H371HABMM: Sample1_</pre><p>
	  will rename reads
	  like <span class="emphasis"><em>DQT9AAQ4:436:H371HABMM:5:1101:9154:3062</em></span>
	  into <span class="emphasis"><em>Sample1_5:1101:9154:3062</em></span>
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><code class="literal">rename_prefix</code> entries are valid per
	  readgroup. I.e., an entry for a readgroup will not rename reads of
	  another readgroup.
	</td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	    Multiple <code class="literal">rename_prefix</code> entries are
	    allowed per readgroup. E.g.:
	  </p><pre class="screen">rename_prefix = DQT9AAQ4:436:H371HABMM: S1sxa_
rename_prefix = m140328_002546_42149_c100624422550000001823118308061414_s1_ S1pb_</pre><p>
	    will rename a read
	    called <code class="literal">DQT9AAQ4:436:H371HABMM:1:1101:3099:2186</code>
	    into <code class="literal">S1sxa_1:1101:3099:2186</code> while renaming
	    another read called <code class="literal">m140328_002546_42149_c100624422550000001823118308061414_s1_p0/100084/10792_20790/0_9573</code>
	    into <code class="literal">S1pb_p0/100084/10792_20790/0_9573</code>
	  </p></td></tr></table></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_manifest_parameters"></a>3.4.4. 
	The manifest file: extended parameters
      </h3></div></div></div><p>
	The <span class="bold"><strong>parameters=</strong></span> line in the manifest
	file opens up the full panoply of possibilities the MIRA assembler
	offers. This ranges from fine-tuning assemblies to setting parameters
	in a way so that MIRA is suited also for very special assembly cases.
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_parameter_groups"></a>3.4.4.1. 
	  Parameter groups
	</h4></div></div></div><p>
	  Some parameters one can set in MIRA somehow belong together. For
	  example: when specifying an overlap in an alignment of two sequences,
	  one could tell the assembler it should look at overlaps only if they
	  have a certain similarity and a certain length. On the other hand,
	  specifying how many processors / threads the assembler should use or
	  whether the results of an assembly should be written out as SAM
	  format does not seem to relate to alignments.
	</p><p>
	  MIRA uses <span class="emphasis"><em>parameter groups</em></span> to keep parameters
	  together which somehow belong together. For example:
	</p><pre class="screen">
<strong class="userinput"><code>parameters = <em class="replaceable"><code> -GENERAL:number_of_threads=4 \
              -ALIGN:min_relative_score=70 -ASSEMBLY:minimum_read_length=150 \
              -OUTPUT:output_result_caf=no</code></em></code></strong></pre><p>
	The parameters of the different parameter groups are described in
	detail a bit later in this manual.
      </p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_technology_sections"></a>3.4.4.2. 
	  Technology sections
	</h4></div></div></div><p>
	  With the introduction of new sequencing technologies, MIRA also had
	  to be able to set values that allow technology specific behaviour of
	  algorithms. One simple example for this could be the minimum length
	  a read must have to be used in the assembly. For Sanger sequences,
	  having this value to be 150 (meaning a read should have at least 150
	  unclipped bases) would be a very valid, albeit conservative
	  choice. For 454 reads and especially Solexa and ABI SOLiD reads
	  however, this value would be ridiculously high.
	</p><p>
	  To allow very fine grained behaviour, especially in hybrid
	  assemblies, and to prevent the explosion of parameter names, MIRA
	  knows two categories of parameters:
	</p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	      <span class="bold"><strong>technology independent parameters</strong></span>
	      which control general behaviour of MIRA like, e.g., the number of
	      assembly passes or file names etc.
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>technology dependent parameters</strong></span>
	      which control behaviour of algorithms where the sequencing
	      technology plays a role. An example of this would be the minimum
	      length of a read (like 200 for Sanger reads and 120 for 454 FLX
	      reads).
	    </p></li></ol></div><p>
	  More on this a bit further down in this documentation.
	</p><p>
	  As example, a manifest using technology dependent and independent parameters could
	  look like this:
	</p><pre class="screen">
<strong class="userinput"><code>parameters = <em class="replaceable"><code>COMMON_SETTINGS -GENERAL:number_of_threads=4 \
              SANGER_SETTINGS -ALIGN:min_relative_score=70 -ASSEMBLY:minimum_read_length=150 \
              454_SETTINGS -ALIGN:min_relative_score=75 -ASSEMBLY:minimum_read_length=100 \
              SOLEXA_SETTINGS -ALIGN:min_relative_score=90 -ASSEMBLY:minimum_read_length=75</code></em></code></strong></pre><p>
	  Now, assume the following read group descriptions in a manifest:
	</p><pre class="screen">
...

readgroup
technology=454
...

readgroup
technology=solexa
...</pre><p>
	  For MIRA, this means a number of parameters should apply to the
	  assembly as a whole, while others apply to the sequencing data itself
	  ... and some parameters might need to be different depending on the
	  technology they apply to. MIRA dumps the parameters it is running
	  with at the beginning of an assembly and it makes it clear there
	  which parameters are "global" and which parameters apply to single
	  technologies.
	</p><p>
	  As an example, here is a part of the output of used parameters that MIRA
	  will show when started with 454 and Illumina (Solexa) data:
	</p><pre class="screen">
...

Assembly options (-AS):
    Number of passes (nop)                      : 1
    Skim each pass (sep)                        : yes
    Maximum number of RMB break loops (rbl)     : 1
    Spoiler detection (sd)                      : no
    Last pass only (sdlpo)                      : yes

    Minimum read length (mrl)                   :  [454]  40
                                                   [sxa]  20
    Enforce presence of qualities (epoq)        :  [454]  no
                                                   [sxa]  yes

...</pre><p>
	  You can see the two different kinds of settings that MIRA uses:
	  <span class="emphasis"><em>common</em></span> <span class="emphasis"><em>settings</em></span> (like
	   [-AS:nop]) which allow only one value and
	  <span class="emphasis"><em>technology</em></span> <span class="emphasis"><em>dependent</em></span>
	  <span class="emphasis"><em>settings</em></span> (like  [-AS:mrl]), where for
	  each sequencing technology used in the project, the setting can be
	  different.
	</p><p>
	  How would one set a minimum read length of 40 and not enforce
	  presence of base qualities for Sanger reads, but for 454 reads a
	  minimum read length of 30 and enforce base qualities? The answer:
	</p><pre class="screen">
job=denovo,genome,draft
parameters= SANGER_SETTINGS -AS:mrl=40:epoq=no 454_SETTINGS -AS:mrl=30:epoq=yes</pre><p>
	  Notice the ..._SETTINGS section in the command line (or parameter file):
	  these tell MIRA that all the following parameters until the advent of
	  another switch are to be set specifically for the said technology.
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	    For improved readability, you can distribute parameters across
	    several lines either by pre-fixing every line with
	    <code class="literal">parameters=</code>, like so:
	  </p><pre class="screen">
job=denovo,genome,draft
parameters= SANGER_SETTINGS -AS:mrl=80:epoq=no
parameters= 454_SETTINGS -AS:mrl=30:epoq=yes</pre><p>
	    Alternatively you can use a backslash at the end of a parameter
	    line to indicate that the next line is a continuing line, like so:
	  </p><pre class="screen">
job=denovo,genome,draft
parameters= SANGER_SETTINGS -AS:mrl=80:epoq=no <strong class="userinput"><code>\</code></strong>
            454_SETTINGS -AS:mrl=30:epoq=yes</pre><p>
	    Note that the very last line of the parameters settings MUST NOT
	    end with a backslash.
	  </p></td></tr></table></div><p>
	  Besides COMMON_SETTINGS, there are currently 7 technology settings available:
	</p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	      SANGER_SETTINGS
	    </p></li><li class="listitem"><p>
	      454_SETTINGS
	    </p></li><li class="listitem"><p>
	      IONTOR_SETTINGS
	    </p></li><li class="listitem"><p>
	      PCBIOLQ_SETTINGS (currently not supported)
	    </p></li><li class="listitem"><p>
	      PCBIOHQ_SETTINGS
	    </p></li><li class="listitem"><p>
	      SOLEXA_SETTINGS
	    </p></li><li class="listitem"><p>
	      TEXT_SETTINGS
	    </p></li></ol></div><p>
	</p><p>
	  Some settings of MIRA are influencing global behaviour and are not
	  related to a specific sequencing technology, these must be set in the
	  COMMON_SETTINGS environment. For example, it would not make sense to try and
	  set a different number of assembly passes for each technology like in
	</p><pre class="screen">
<strong class="userinput"><code>parameters= 454_SETTINGS -AS:nop=4 SOLEXA_SETTINGS -AS:nop=3</code></strong></pre><p>
	  Besides being contradictory, this does not really make sense. MIRA will
	  complain about cases like these. Simply set those common settings in
	  an area prefixed with the COMMON_SETTINGS switch like in
	</p><pre class="screen">
<strong class="userinput"><code>parameters= COMMON_SETTINGS -AS:nop=4 454_SETTINGS ... SOLEXA_SETTINGS ...</code></strong></pre><p>
	</p><p>
	  Since MIRA 3rc3, the parameter parser will help you by checking
	  whether parameters are correctly defined as COMMON_SETTINGS or
	  technology dependent setting.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_parameter_shortnames"></a>3.4.4.3. 
	  Parameter short names
	</h4></div></div></div><p>
	  Writing the verbose form of parameters can be quite a long task. Here is a short example:
	</p><pre class="screen">
<strong class="userinput"><code>parameters = <em class="replaceable"><code>COMMON_SETTINGS -GENERAL:number_of_threads=4 \
              SANGER_SETTINGS -ALIGN:min_relative_score=70 -ASSEMBLY:minimum_read_length=150 \
              454_SETTINGS -ALIGN:min_relative_score=75 -ASSEMBLY:minimum_read_length=100 \
              SOLEXA_SETTINGS -ALIGN:min_relative_score=90 -ASSEMBLY:minimum_read_length=75</code></em></code></strong></pre><p>
	  However, every parameter has a shortened form. The above could be written like this:
	</p><pre class="screen">
<strong class="userinput"><code>parameters = <em class="replaceable"><code>COMMON_SETTINGS -GE:not=4 \
              SANGER_SETTINGS -AL:mrs=70 -AS:mrl=150 \
              454_SETTINGS -AL:mrs=75 -AS:mrl=100 \
              SOLEXA_SETTINGS -AL:mrs=90 -AS:mrl=75</code></em></code></strong></pre><p>
	Please note that it is also perfectly legal to decompose the switches
	so that they can be used more easily in scripted environments (notice
	the multiple -AL in some sections of the following example):
      </p><pre class="screen">
<strong class="userinput"><code>parameters = <em class="replaceable"><code>COMMON_SETTINGS -GE:not=4 \
              SANGER_SETTINGS \
                -AL:mrs=70 \
		-AL:mrl=150 \
              454_SETTINGS -AL:mrs=75:mrl=100 \
              SOLEXA_SETTINGS \
	        -AL:mrs=90 \
                -AL:mrl=75</code></em></code></strong></pre></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_order_dependent_quick_switches"></a>3.4.4.4. 
	  Order dependent quick switches
	</h4></div></div></div><p>
	  For some parameters, the order of appearance in the parameter lines
	  of the manifest is important. This is because the <span class="emphasis"><em>quick
	  parameters</em></span> are realised internally as a collection of
	  extended parameters that will overwrite any previously manually set
	  extended parameters. It is generally a good idea to place quick parameters in
	  the order as described in this documentation, that is: first the
	  order dependent quick parameters, then other quick parameters, then all
	  the other extended parameters.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [--hirep_best]
	    , </span><span class="term">
	      [--hirep_good]
	    , </span><span class="term">
	      [--hirep_something]
	    </span></dt><dd><p>
		These are modifier switches for genome data that is deemed to
		be highly repetitive. With <span class="emphasis"><em>hirep_good</em></span> and
		<span class="emphasis"><em>hirep_best</em></span>, the assemblies will run
		slower due to more iterative cycles and slightly different
		default parameter sets that give MIRA a chance to resolve many
		nasty repeats. The <span class="emphasis"><em>hirep_something</em></span> switch
		goes the other way round and resolves repeats less well than a
		normal assembly, but allows MIRA to finish even on more
		complex data.
	      </p><p>
		Usage recommendations bacteria: starting MIRA without any
		hirep switches yields good enough results in most cases. Under
		normal circumstances one can use
		<span class="emphasis"><em>hirep_good</em></span> or
		even <span class="emphasis"><em>hirep_best</em></span> without remorse as data
		sets and genome complexities are small enough to run within a
		couple of hours at most.
	      </p><p>
		Usage recommendations for 'simple' lower eukaryotes: starting
		MIRA without any hirep switches yields good enough results in
		most cases. If the genomes are not too complex,
		using <span class="emphasis"><em>hirep_good</em></span> can be a possibility.
	      </p><p>
		Usage recommendations for lower eukaryotes with complex
		repeats: starting MIRA without any hirep switches might
		already take too long or create temporary data files which are
		too big. For these cases, using
		<span class="emphasis"><em>hirep_something</em></span> makes MIRA use a
		parameter set which is targeted at resolving the
		non-repetitive areas of a genome and additionally all repeats
		which occur less than 10 times in the genome. Repeats occurring
		more often will not be resolved, but using the debris
		information one can recover affected reads and use these with
		harsh data reduction algorithms (e.g. digital normalisation)
		to get a glimpse into these.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		These switches replace the '--highlyrepetitive' switch from
		earlier versions.
	      </td></tr></table></div></dd><dt><span class="term">
	      [--noclipping=...]
	    </span></dt><dd><p> Switches off clipping options for given sequencing
	      technologies. Technologies can be <span class="emphasis"><em>sanger</em></span>,
	      <span class="emphasis"><em>454</em></span>, <span class="emphasis"><em>iontor</em></span>,
	      <span class="emphasis"><em>solexa</em></span> or
	      <span class="emphasis"><em>solid</em></span>. Multiple entries separated by comma.
	      </p><p> Note that [-CL:pec] and the chimera clipping
	       [-CL:gbcdc] are not switched off by this parameter and
	      should be switched off separately.
	      </p><p> Examples:
	      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
		    Switch off 454 and Solexa (but possibly keep Sanger
		    clipping): <code class="literal">--noclipping=454,solexa</code>
		  </p></li><li class="listitem"><p>
		    Switch off all: <code class="literal">--noclipping</code>
		    or <code class="literal">--noclipping=all</code>
		  </p></li></ol></div></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_general_ge"></a>3.4.4.5. 
	  Parameter group: -GENERAL (-GE)
	</h4></div></div></div><p>
	  General options control the type of assembly to be performed and
	  other switches not belonging anywhere else.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [number_of_threads(not)=<em class="replaceable"><code>0 &#8804; integer &#8804; 256</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">0</span>. Master switch to set the number
	      of threads used in different parts of MIRA.
	      </p><p>
		A value of 0 tells MIRA to set this to the number of available
		physical cores on the machine it runs on. That is,
		hyperthreaded "cores" are not counted in as using these would
		cause a tremendous slowdown in the heavy duty computation
		parts. E.g., a machine with 2 processors having 4 cores each
		will have this value set to 8.
	      </p><p>
		In case MIRA cannot find out the number of cores, the
		fall-back value is <span class="underline">2</span>.
	      </p><p>
		Note: when running the SKIM algorithm in parallel threads,
		MIRA can give different results when started with the same
		data and same arguments. While the effect could be averted for
		SKIM, the memory cost for doing so would be an additional 50%
		for one of the large tables, so this has not been implemented
		at the moment. Besides, at the latest when the Smith-Watermans
		run in parallel, this could not be easily avoided at all.
	      </p></dd><dt><span class="term">
	      [automatic_memory_management(amm)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">Yes</span>. Whether
		MIRA tries to optimise run time of certain algorithms in a
		space/time trade-off memory usage, increasing or reducing some
		internal tables as memory permits.
	      </p><p>
		Note 1: This functionality currently relies on the
		<code class="filename">/proc</code> file system giving information on
		the system memory ("MemTotal" in /proc/meminfo) and the memory
		usage of the current process ("VmSize" in
		<code class="filename">/proc/self/status</code>). If this is not
		available, the functionality is switched off.
	      </p><p>
		Note 2: The automatic memory management can only work if there
		actually is unused system memory. It's not a wonder switch
		which reduces memory consumption. In tight memory situations,
		memory management has no effect and the algorithms fall back
		to minimum table sizes. This means that the effective size in
		memory can grow larger than given in the memory management
		parameters, but then MIRA will try to keep the additional
		memory requirements to a minimum.
	      </p></dd><dt><span class="term">
	      [max_process_size(mps)=<em class="replaceable"><code>0 &#8804; integer</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">0</span>. If
	      automatic memory management is used (see above), this number is
	      the size in gigabytes that the MIRA process will use as maximum
	      target size when looking for space/time trade-offs. A value of 0
	      means that MIRA does not try to keep a fixed upper limit.
	      </p><p>
		Note: when in competition to [-GE:kpmf] (see below),
		the smaller of both sizes is taken as target. Example: if your
		machine has 64 GiB but you limit the use to 32 GiB, then the
		MIRA process will try to stay within these 32 GiB.
	      </p></dd><dt><span class="term">
	      [keep_percent_memory_free(kpmf)=<em class="replaceable"><code>0 &#8804; integer</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">10</span>. If
	      automatic memory management is used (see above), this number
	      works a bit like  [-GE:mps] but the other way round: it
	      tries to keep x percent of the memory free.
	      </p><p>
		Note: when in competition to [-GE:mps] (see above),
		the argument leaving the most memory free is taken as
		target. Example: if your machine has 64 GiB and you limit the
		use to 42 GiB via  [-GE:mps] but have a
		 [-GE:kpmf] of 50, then the MIRA process will try to
		stay within 64-(64*50%)=32 GiB.
	      </p></dd><dt><span class="term">
	      [preprocess_only(ppo)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">no</span>. As a
	      special use case, MIRA will just run the following tasks:
	      loading and clipping of reads as well as calculating kmer
	      frequencies and read repeat information. The resulting reads can
	      then be found as MAF file in the checkpoint directory; the read
	      repeat information in the info directory.
	      </p><p>
		No assembly is performed.
	      </p></dd><dt><span class="term">
	      [est_snp_pipeline_step(esps)=<em class="replaceable"><code>1 &#8804; integer &#8804; 4</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">1</span>. Controls the starting step of the
	      SNP search in EST pipeline and is therefore only useful in
	      miraSearchESTSNPs.
	      </p><p>
		EST assembly is a three step process, each with different
		settings to the assembly engine, with the result of each step
		being saved to disk. If results of previous steps are present
		in a directory, one can easily "play around" with different
		settings for subsequent steps by reusing the results of the
		previous steps and directly starting with step two or three.
	      </p></dd><dt><span class="term">
	      [print_date(pd)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. Controls
		whether date and time are printed out during the
		assembly. Suppressing it is not useful in normal operation,
		only when debugging or benchmarking.
	      </p></dd><dt><span class="term">
	      [bang_on_throw(bot)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>. For
		debugging purposes only. Controls whether MIRA raises a signal
		when detecting an error which triggers a running debugger like
		gdb.
	      </p></dd></dl></div><p>
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_assembly_as"></a>3.4.4.6. 
	  Parameter group: -ASSEMBLY (-AS)
	</h4></div></div></div><p>
	  General options for controlling the assembly.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [num_of_passes(nop)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">0</span>. Defines how many iterations of the whole
		assembly process are done.
	      </p><p>
		The default of 0 will let MIRA choose automatically the number
		of passes and the kmer sizes used in each pass
		(see also [-AS:kms] below).
	      </p><p>
		Early termination: if the number of passes was chosen too
		high, one can simply create a file
		<code class="filename"><em class="replaceable"><code>projectname</code></em>_assembly/<em class="replaceable"><code>projectname</code></em>_d_chkpt/terminate</code>. At
		the beginning of a new pass, MIRA checks for the existence of
		that file and, if it finds it, acknowledges by renaming it to
		<code class="filename">terminate_acknowledged</code> and then runs 2
		more passes (with special "last pass routines") before
		finishing the assembly.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		As a rule of thumb, <span class="emphasis"><em>de-novo</em></span> assemblies
		should always have at least two passes,
		while <span class="emphasis"><em>mapping</em></span> assemblies should work with
		only one pass. Not doing this will lead to results unexpected
		by users. The reason is that the MIRA learning routines
		either have no chance to learn enough about the assembly (for
		de-novo with one pass) or learn "too much" (mapping with more
		than one pass).
	      </td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		MIRA versions &#8804; 4.0.2 were interpreting the value of '0' in
		a different way and only performed pre-processing of
		reads. MIRA can still do this, but this is controlled by the
		new parameter [-GE:ppo].
	      </td></tr></table></div></dd><dt><span class="term">
	      [kmer_series(kms)=<em class="replaceable"><code>comma separated list of integers &#8805; 0 and &#8804; 256</code></em>]
	    </span></dt><dd><p>
		Default is an empty value. If set, overrides [-AS:nop] and  [-SK:kms].
	      </p><p>
		If set, this parameter provides a one-stop-shop for defining the number of passes and the kmer size used in each pass. E.g.: <code class="literal">-AS:kms=17,31,63,127</code> defines an assembly with 4 passes which uses a kmer size of 17 in pass 1, 31 in pass 2, 63 in pass 3 and 127 in pass 4.
	      </p><p>
		Note that it is perfectly valid to use the same kmer size more than once, e.g.: <code class="literal">17,31,63,63</code> will perform a 4 pass assembly, using a kmer size of 63 in passes 3 and 4. It also makes sense to do this, as with default parameters MIRA uses its integrated automatic editor which edits away obvious sequencing errors in each step, thus the second pass with a kmer size of 63 bases can rely on improved reads.
	      </p></dd><dt><span class="term">
	      [rmb_break_loops(rbl)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology and assembly
		quality level. Defines the maximum number of times a contig
		can be rebuilt during a main assembly pass
		(see [-AS:nop] or  [-AS:kms]) if misassemblies due to possible repeats
		are found.
	      </p></dd><dt><span class="term">
	      [max_contigs_per_pass(mcpp)=<em class="replaceable"><code>integer</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">0</span>. Defines
		how many contigs are maximally built in each pass. A value of
		0 stands for 'unlimited'. Values &gt;0 can be used for special
		use cases like test assemblies etc.
	      </p><p>
		If in doubt, do not touch this parameter.
	      </p></dd><dt><span class="term">
	      [automatic_repeat_detection(ard)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is currently <span class="underline">yes</span>. Tells MIRA to use coverage
		information accumulated over time to more accurately pinpoint reads that are
		in repetitive regions.
	      </p></dd><dt><span class="term">
	      [coverage_threshold(ardct)=<em class="replaceable"><code>float &gt; 1.0</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">2.0</span> for all sequencing technologies in most assembly cases. This
	      option says this: if a read has ever been aligned at positions
	      where the total coverage of all reads of the same sequencing technology
	      attained the average coverage times  [-AS:ardct] (over a length of
	       [-AS:ardml], see below), then this read is considered to be
	      repetitive.
	      </p></dd><dt><span class="term">
	      [min_length(ardml)=<em class="replaceable"><code>integer &gt; 1</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology, currently
		<span class="underline">400</span> for Sanger and
		<span class="underline">200</span> for 454 and Ion
		Torrent.
	      </p><p>
		A coverage must be at least this number of bases higher than
		[-AS:ardct] before being really treated as repeat.
	      </p></dd><dt><span class="term">
	      [grace_length(ardgl)=<em class="replaceable"><code>integer &gt; 1</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology.
	      </p></dd><dt><span class="term">
	      [uniform_read_distribution(urd)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is currently always <span class="underline">no</span>
		as these algorithms were supplanted by better ones in MIRA 4.0.
	      </p><p>
		Takes effect only if uniform read distribution
		([-AS:urd]) is on.
	      </p><p>
		When set to <span class="underline">yes</span>, MIRA
		will analyse coverage of contigs built at a certain stage of
		the assembly and estimate an average expected coverage of
		reads for contigs. This value will be used in subsequent
		passes of the assembly to ensure that no part of the contig
		gets significantly more read coverage of reads that were
		previously identified as repetitive than the estimated average
		coverage allows for.
	      </p><p>
		This switch is useful to disentangle repeats that are
		otherwise 100% identical and generally allows to build larger
		contigs. It is expected to be useful for Sanger and 454
		sequences. Usage of this switch with Solexa and Ion Torrent
		data is currently not recommended.
	      </p><p>
		It is a real improvement to disentangle repeats, but has the
		side-effect of creating some "contig debris" (small and low
		coverage contigs, things you normally can safely throw away as
		they are representing sequence that already has enough
		coverage).
	      </p><p>
		This switch must be set to <span class="underline">no</span> for EST assembly, assembly of
		transcripts etc. It is recommended to also switch this off for
		mapping assemblies.
	      </p></dd><dt><span class="term">
	      [urd_startinpass(urdsip)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology and assembly
		quality level. Recommended values are: 3 for an assembly with
		3 to 4 passes ([-AS:nop]). Assemblies with 5 passes
		or more should set the value to the number of passes minus 2.
	      </p><p>
		Takes effect only if uniform read distribution
		([-AS:urd]) is on.
	      </p></dd><dt><span class="term">
	      [urd_clipoffmultiplier(urdcm)=<em class="replaceable"><code>float &gt; 1.0</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">1.5</span> for all
	        sequencing technologies in most assembly cases.
	      </p><p>
		This option says this: if MIRA determined that the average
		coverage is <span class="emphasis"><em>x</em></span>, then in subsequent passes it will allow
		coverage for reads determined to be repetitive to be built
		into the contig only up to a total coverage of
		<span class="emphasis"><em>x*urdcm</em></span>. Reads that bring the coverage above the threshold
		will be rejected from that specific place in the contig (and
		either be built into another copy of the repeat somewhere else
		or end up as contig debris).
	      </p><p>
		Please note that the lower [-AS:urdcm] is, the more
		contig debris you will end up with (contigs with an average
		coverage less than half of the expected coverage, mostly short
		contigs with just a couple of reads).
	      </p><p>
		Takes effect only if uniform read distribution ([-AS:urd]) is on.
	      </p></dd><dt><span class="term">
	      [spoiler_detection(sd)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology and assembly
		quality level. A spoiler can be either a chimeric read or it
		is a read with long parts of unclipped vector sequence still
		included (that was too long for the [-CL:pvc] vector
		leftover clipping routines). A spoiler typically prevents
		contigs from being joined; MIRA will cut them back so that they
		represent no more harm to the assembly.
	      </p><p>
		Recommended for assemblies of mid- to high-coverage genomic
		assemblies, not recommended for assemblies of ESTs as one
		might lose splice variants with that.
	      </p><p>
		A minimum number of two assembly passes ([-AS:nop])
		must be run for this option to take effect.
	      </p></dd><dt><span class="term">
	      [sd_last_pass_only(sdlpo)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. Defines
		whether the spoiler detection algorithms are run only for the
		last pass or for all passes ( [-AS:nop]).
	      </p><p>
		Takes effect only if spoiler detection ([-AS:sd]) is on. If in
		doubt, leave it to 'yes'.
	      </p></dd><dt><span class="term">
	      [minimum_read_length(mrl)=<em class="replaceable"><code>integer &#8805; 20</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology. Defines the minimum length that
	      reads must have to be considered for the assembly. Shorter sequences will be
	      filtered out at the beginning of the process and won't be present in the
	      final project.
	      </p></dd><dt><span class="term">
	      [minimum_reads_per_contig(mrpc)=<em class="replaceable"><code>integer &#8805; 1</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology and the
		[--job] parameter. For genome assemblies it's usually
		around <span class="underline">2</span> for Sanger,
		<span class="underline">5</span> for 454, <span class="underline">5</span> for Ion Torrent, <span class="underline">5</span> for PacBio and <span class="underline">10</span> for Solexa. In EST assemblies,
		it's currently <span class="underline">2</span> for all
		sequencing technologies.
	      </p><p>
		Defines the minimum number of reads a contig must have before
		it is built or saved by MIRA. Overlap clusters with fewer reads
		than defined will not be assembled into contigs but reads in
		these clusters will be immediately transferred to debris.
	      </p><p>
		This parameter is useful to considerably reduce assembly time
		in large projects with millions of reads (like in Solexa
		projects) where a lot of small "junk" contigs with
		contamination sequence or otherwise uninteresting data may be
		created otherwise.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Important: a value larger than 1 of this parameter interferes with
		the functioning of [-OUT:sssip] and
		 [-OUT:stsip].
	      </td></tr></table></div></dd><dt><span class="term">
	      [enforce_presence_of_qualities(epoq)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. When set
		to yes, MIRA will stop the assembly if any read has no quality
		values loaded.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">[-AS:epoq] switches on/off the quality check for a
		complete sequencing technology. A more fine grained control
		for switching checks per readgroup is available via
		the <span class="emphasis"><em>default_qual</em></span> readgroup parameter in
		the manifest file.
	      </td></tr></table></div></dd><dt><span class="term">
	      [use_genomic_pathfinder(ugpf)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. MIRA has
		two different pathfinder algorithms it chooses from to find
		its way through the (more or less) complete set of possible
		sequence overlaps: a genomic and an EST pathfinder. The
		genomic looks a bit into the future of the assembly and tries
		to stay on safe grounds using a maximum of information already
		present in the contig that is being built. The EST version on
		the contrary will directly jump at the complex cases posed by
		very similar repetitive sequences and try to solve those first
		and is willing to fall back to first-come-first-served when
		really bad cases (like, e.g., coverage with thousands of
		sequences) are encountered.
	      </p><p>
		Generally, the genomic pathfinder will also work quite well
		with EST sequences (but might get slowed down a lot in
		pathological cases), while the EST algorithm does not work so
		well on genomes. If in doubt, leave on <span class="underline">yes</span> for genome projects and set to
		<span class="underline">no</span> for EST projects.
	      </p></dd><dt><span class="term">
	      [use_emergency_search_stop(uess)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. Another
		important switch if you plan to assemble non-normalised EST
		libraries, where some ESTs may reach coverages of several
		hundreds or thousands of reads. This switch lets MIRA save a
		lot of computational time when aligning those extremely high
		coverage areas (but only there), at the expense of some
		accuracy.
	      </p></dd><dt><span class="term">
	      [ess_partnerdepth(esspd)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">500</span>. Defines the number of potential
	      partners a read must have for MIRA switching into emergency
	      search stop mode for that read.
	      </p></dd><dt><span class="term">
	      [use_max_contig_buildtime(umcbt)=<em class="replaceable"><code>on|y[es]|t[rue],off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>. Defines whether there is an upper limit of time
		to be used to build one contig. Set this to yes in EST assemblies where you
		think that extremely high coverages occur. Less useful for assembly of
		genomic sequences.
	      </p></dd><dt><span class="term">
	      [buildtime_in_seconds(bts)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">3600</span> for genome
	      assemblies, <span class="underline">720</span> for EST
	      assemblies with Sanger or 454
	      and <span class="underline">360</span> for EST assemblies
	      with Solexa or Ion Torrent. Depending on  [-AS:umcbt]
	      above, this number defines the time in seconds allocated to
	      building one contig.
	      </p></dd></dl></div><p>
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_strain_backbone_sb"></a>3.4.4.7. 
	  Parameter group: -STRAIN/BACKBONE (-SB)
	</h4></div></div></div><p>
	  Controlling backbone options in mapping assemblies:
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [bootstrap_new_backbone(bnb)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span> for
		mapping assemblies with Illumina data, no otherwise.
	      </p><p>
		When set to 'yes', MIRA will use a two stage mapping process
		which bootstraps an intermediate backbone (reference) sequence
		and greatly improves mapping accuracy at indel sites.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Currently only works with Illumina data, other sequencing
		technologies will not be affected by this flag.
	      </td></tr></table></div></dd><dt><span class="term">
	      [startbackboneusage_inpass(sbuip)=<em class="replaceable"><code>0 &lt; integer</code></em>]
	    </span></dt><dd><p> Default is
	      dependent on assembly quality level chosen: 0 for 'draft'
	      and [-AS:nop] divided by 2 for 'accurate'.
	      </p><p>
		When assembling against backbones, this parameter defines the
		pass iteration (see [-AS:nop]) from which on the
		backbones will be really used. In the passes preceding this
		number, the non-backbone reads will be assembled together as
		if no backbones existed.  This allows MIRA to correctly spot
		repetitive stretches that differ by single bases and tag them
		accordingly. Note that full assemblies are considerably slower
		than mapping assemblies, so be careful with this when
		assembling millions of reads.
	      </p><p>
		Rule of thumb: if backbones belong to same strain as reads to assemble, set
		to <span class="underline">1</span>. If backbones are a different strain, then set
		 [-SB:sbuip] to 1 lower than  [-AS:nop] (example: nop=4 and
		sbuip=3).
	      </p></dd><dt><span class="term">
	      [backbone_raillength(brl)=<em class="replaceable"><code>0 &#8804; integer &#8804; 10000</code></em>]
	    </span></dt><dd><p> Default
	      is <span class="underline">0</span>. Parameter for the
	      internal sectioning size of the backbone to compute optimal
	      alignments. Should be set to two times length of longest read in
	      input data + 15%. When set to 0, MIRA will compute optimal
	      values from the data loaded.
	      </p></dd><dt><span class="term">
	      [backbone_railoverlap(bro)=<em class="replaceable"><code>0 &#8804; integer &#8804; 2000</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">0</span>.
	      Parameter for the internal sectioning size of the backbone to
	      compute optimal alignments. Should be set to length of the
	      longest read. When set to 0, MIRA will compute optimal values
	      from the data loaded.
	      </p></dd><dt><span class="term">
	      [trim_overhanging_reads(tor)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>.
	      </p><p>
		When set to 'yes', MIRA will trim back reads at end of contigs
		which outgrow the reference sequence so that boundaries of
		the reference and the mapped reads align perfectly. That is,
		the mapping does not perform a sequence extension.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		The trimming is performed via setting low quality cutoffs in
		the reads, i.e., the trimmed parts are not really gone but
		just not part of the active contig anymore. They can be
		uncovered when working on the assembly in finishing programs
		like, e.g., <span class="command"><strong>gap4</strong></span>
		or <span class="command"><strong>gap5</strong></span>.
	      </td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		Previous versions of MIRA (up to and including 3.9.18) behaved
		as if this option had been set to 'no'. This is a major change
		in behaviour, but it is also what probably most people expect
		from a mapping.
	      </td></tr></table></div></dd><dt><span class="term">
	      [also_build_new_contigs(abnc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>. Standard
		mapping assembly mode of the assembler is to map available
		reads to a backbone and discard reads that do not fit.  If set
		to 'yes', MIRA will use reads that did not map to the
		backbone(s) to make new contigs (if possible). Please note:
		while a simple mapping assembly is comparatively cheap in
		terms of memory and time consumed, setting this option to
		'yes' means that behind the scenes data for a full blown
		de-novo assembly is generated in addition to the data needed
		for a mapping assembly. This means, that in terms of memory
		consumption and speed, this switch combines the worst of both
		worlds.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		Using this switch is not recommended. Besides the memory and
		speed considerations, a lot of different algorithms cannot
		work optimally in this mode. I recommend to use a two step
		approach instead: first map with MIRA, then assemble de-novo
		all reads which did not map. This will lead more often than
		not to the results expected (and in shorter time).
	      </td></tr></table></div></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_dataprocessing_dp"></a>3.4.4.8. 
	  Parameter group: -DATAPROCESSING (-DP)
	</h4></div></div></div><p>
	  Options for controlling some data processing during the assembly.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [use_read_extension(ure)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default
	      is dependent of the sequencing technology used: <span class="underline">yes</span> for Sanger,
	      no for all others. MIRA expects the sequences it is given to be
	      quality clipped. During the assembly though, it will try to extend reads
	      into the clipped region and gain additional coverage by analysing
	      Smith-Waterman alignments between reads that were found to be valid. Only
	      the right clip is extended though, the left clip (most of the time
	      containing sequencing vector) is never touched.
	      </p></dd><dt><span class="term">
	      [read_extension_window_length(rewl)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default
	      is dependent of the sequencing technology used. Only takes effect when
	      [-DP:ure] (see above) is set to <span class="underline">yes</span>. The read extension
	      routines use a sliding window approach on Smith-Waterman alignments. This
	      parameter defines the window length.
	      </p></dd><dt><span class="term">
	      [read_extension_with_maxerrors(rewme)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology used. Only takes effect
		when [-DP:ure] (see above) is set to <span class="underline">yes</span>. The read
		extension routines use a sliding window approach on Smith-Waterman
		alignments. This parameter defines the maximum number of errors
		(=disagreements) between two aligned reads in the given window.
	      </p></dd><dt><span class="term">
	      [first_extension_in_pass(feip)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used. Only takes effect when
	      [-DP:ure] (see above) is set to <span class="underline">yes</span>. The read extension
	      routines can be called before assembly and/or after each assembly pass (see
	       [-AS:nop]). This parameter defines the first pass in which the read
	      extension routines are called. The default of <span class="underline">0</span> tells
	      MIRA to extend the reads the first time before the first assembly
	      pass.
	      </p></dd><dt><span class="term">
	      [last_extension_in_pass(leip)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used. Only takes effect when
	      [-DP:ure] (see above) is set to <span class="underline">yes</span>. The read extension
	      routines can be called before assembly and/or after each assembly pass (see
	       [-AS:nop]). This parameter defines the last pass in which the read
	      extension routines are called. The default of <span class="underline">0</span> tells
	      MIRA to extend the reads the last time before the first assembly
	      pass.
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_clipping_cl"></a>3.4.4.9. 
	  Parameter group: -CLIPPING (-CL)
	</h4></div></div></div><p>
	  Controls for clipping options: when and how sequences should be clipped.
	</p><p>
	  Every option in this section can be set individually for every sequencing
	  technology, giving a very fine grained control on how reads are clipped for
	  each technology.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [msvs_gap_size(msvsgs)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology used. Takes
		effect only when loading data from ancillary SSAHA2 or SMALT
		files.
	      </p><p>
		While performing the clip of screened vector sequences, MIRA
		will look if it can merge larger chunks of sequencing vector
		bases that are a maximum of [-CL:msvsgs] apart.
	      </p></dd><dt><span class="term">
	      [msvs_max_front_gap(msvsmfg)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology used. Takes
		effect only when loading data from ancillary SSAHA2 or SMALT
		files.
	      </p><p>
		While performing the clip of screened vector sequences at the
		start of a sequence, MIRA will allow up to this number of
		non-vector bases in front of a vector stretch.
	      </p></dd><dt><span class="term">
	      [msvs_max_end_gap(msvsmeg)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology used. Takes
		effect only when loading data from ancillary SSAHA2 or SMALT
		files.
	      </p><p>
		While performing the clip of screened vector sequences at the
		end of a sequence, MIRA will allow up to this number of
		non-vector bases behind a vector stretch.
	      </p></dd><dt><span class="term">
	      [possible_vector_leftover_clip(pvlc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology
		used: <span class="underline">yes</span> for
		Sanger, <span class="underline">no</span> for any
		other. MIRA will try to identify possible sequencing vector
		relics present at the start of a sequence and clip them
		away. These relics are usually a few bases long and were not
		correctly removed from the sequence in data preprocessing
		steps of external programs.
	      </p><p>
		You might want to turn off this option if you know (or think)
		that your data contains a lot of repeats and the option below
		to fine tune the clipping behaviour does not give the expected
		results.
	      </p><p>
		You certainly want to turn off this option in EST assemblies
		as this will quite certainly cut back (and thus hide)
		different splice variants. But then make certain that your
		pre-processing of Sanger data (sequencing vector removal) is
		good, other sequencing technologies are not affected then.
	      </p></dd><dt><span class="term">
	      [pvc_maxlenallowed(pvcmla)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is dependent of the sequencing technology
	      used. The clipping of possible vector relics option works quite
	      well. Unfortunately, especially the bounds of repeats or
	      differences in EST splice variants sometimes show the same
	      alignment behaviour as possible sequencing vector relics and
	      could therefore also be clipped.
	      </p><p>
		To keep the vector clipping from mistakenly clipping repetitive
		regions or EST splice variants, this option puts an upper
		bound to the number of bases a potential clip is allowed to
		have. If the number of bases is below or equal to this
		threshold, the bases are clipped. If the number of bases
		exceeds the threshold, the clip
		is <span class="bold"><strong>NOT</strong></span> performed.
	      </p><p>
		Setting the value to 0 turns off the threshold, i.e., clips are then always
		performed if a potential vector was found.
	      </p></dd><dt><span class="term">
	      [quality_clip(qc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">no</span>. This will let MIRA
	      perform its own quality clipping before sequences are entered
	      into the assembly. The clip function performed is a sequence end
	      window quality clip with back iteration to get a maximum number
	      of bases as useful sequence. Note that the bases clipped away
	      here can still be used afterwards if there is enough evidence
	      supporting their correctness when the option  [-DP:ure]
	      is turned on.
	      </p><p>
		Warning: The windowing algorithm works pretty well for Sanger,
		but apparently does not like 454 type data. It's advisable to
		not switch it on for 454. Besides, the 454 quality clipping
		algorithm performs a pretty decent albeit not perfect job, so
		for genomic 454 data (not! ESTs), it is currently recommended
		to use a combination of [-CL:emrc] and
		 [-DP:ure].
	      </p></dd><dt><span class="term">
	      [qc_minimum_quality(qcmq)=<em class="replaceable"><code>integer &#8805; 15 and &#8804; 35</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology used. This is the minimum
		quality that bases in a window require to be accepted. Please be cautious not to
		take too extreme values here, because then the clipping will be too lax or
		too harsh. Values below 15 and higher than 30-35 are not recommended.
	      </p></dd><dt><span class="term">
	      [qc_window_length(qcwl)=<em class="replaceable"><code>integer &#8805; 10</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used. This is the length of a window
	      in bases for the quality clip.
	      </p></dd><dt><span class="term">
	      [bad_stretch_quality_clip (bsqc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">no</span>. This
		option allows to clip reads that were not correctly preprocessed
		and have unclipped bad quality stretches that might prevent a
		good assembly.
	      </p><p> MIRA will search the sequence in forward direction for a
		stretch of bases that have on average a quality less than a
		defined threshold and then set the right quality clip of this
		sequence to cover the given stretch.
	      </p></dd><dt><span class="term">
	      [bsqc_minimum_quality (bsqcmq)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is dependent
	      of the sequencing technology used. Defines the minimum average quality a
	      given window of bases must have. If this quality is not reached, the
	      sequence will be clipped at this position.
	      </p></dd><dt><span class="term">
	      [bsqc_window_length (bsqcwl)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is dependent of the
	      sequencing technology used. Defines the length of the window within which
	      the average quality of the bases is computed.
	      </p></dd><dt><span class="term">
	      [maskedbases_clip(mbc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used. This will let MIRA
	      perform a 'clipping' of bases that were masked out (replaced with the
	      character X). It is generally not a good idea to use mask bases to remove
	      unwanted portions of a sequence, the EXP file format and the NCBI traceinfo
	      format have excellent possibilities to circumvent this. But because a lot of
	      preprocessing software are built around cross_match, scylla-
	      and phrap-style of base masking, the need arose for MIRA to
	      be able to handle this, too.  MIRA will look at the start and end of
	      each sequence to see whether there are masked bases that should be
	      'clipped'.
	      </p></dd><dt><span class="term">
	      [mbc_gap_size(mbcgs)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is dependent of
	      the sequencing technology used. While performing the clip of masked bases,
	      MIRA will look if it can merge larger chunks of masked bases that are
	      a maximum of [-CL:mbcgs] apart.
	      </p></dd><dt><span class="term">
	      [mbc_max_front_gap(mbcmfg)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used. While performing the clip of
	      masked bases at the start of a sequence, MIRA will allow up to this
	      number of unmasked bases in front of a masked stretch.
	      </p></dd><dt><span class="term">
	      [mbc_max_end_gap(mbcmeg)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used. While performing the clip of
	      masked bases at the end of a sequence, MIRA will allow up to this
	      number of unmasked bases behind a masked stretch.
	      </p></dd><dt><span class="term">
	      [lowercase_clip_front(lccf)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used: on for 454 and Ion
	      Torrent data, off for all
	      others. This will let MIRA perform a 'clipping' of bases that are in
	      lowercase at the front end of a sequence, leaving only the uppercase
	      sequence. Useful when handling 454 data that does not have ancillary data in
	      XML format.
	      </p></dd><dt><span class="term">
	      [lowercase_clip_back(lccb)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used: on for 454 and Ion
	      Torrent data, off for all
	      others. This will let MIRA perform a 'clipping' of bases that are in
	      lowercase at the back end of a sequence, leaving only the uppercase
	      sequence. Useful when handling 454 data that does not have ancillary data in
	      XML format.
	      </p></dd><dt><span class="term">
	      [clip_polyat(cpat)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">yes</span> for all EST/RNASeq
	      assemblies. Poly-A stretches in forward reads and poly-T
	      stretches in reverse reads get either clipped or tagged here
	      (see  [-CL:cpkps] below). The assembler will not use
	      these stretches for finding overlaps, but it will use these to
	      discern and disassemble different 3' UTR endings.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		Should poly-A / poly-T stretches have been trimmed in
		pre-processing steps before MIRA got the reads, this option
		MUST be switched off.
	      </td></tr></table></div></dd><dt><span class="term">
	      [cp_keep_poly_stretch (cpkps)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">yes</span> but takes effect only
	      if  [-CL:cpat] (see above) is also set to yes.
	      </p><p>
		Instead of clipping the poly-A / poly-T sequence away, the
		stretch in question in the reads is kept and tagged. The tags
		provide additional information for MIRA to discern between
		different 3' UTR endings and also a good visual anchor when
		looking at the assembly with different programs.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		One side-effect of this option is that the poly-A / poly-T
		stretches are 'cleaned'. That is, single non-poly A / poly-T
		bases within the stretch are automatically edited to be
		conforming to the surrounding stretch. This is necessary as
		homopolymers are by nature one of the hardest motifs to be
		sequenced correctly by any sequencing technology and one
		frequently gets 'dirty' poly-A sequence from sequencing and
		this interferes heavily with the methods MIRA uses to discern
		repeats.
	      </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Keeping the poly-A sequence is a two-edged sword: on one hand it
		enables MIRA to discern different 3' UTR endings, on the other hand
		it might be that sequencing problems toward the end of reads
		create false-positive different endings. If you find that this
		is the case for your data, just switch off this option: MIRA
		will then simply build the longest possible 3' UTRs.
	      </td></tr></table></div></dd><dt><span class="term">
	      [cp_min_sequence_len(cpmsl)=<em class="replaceable"><code>integer &gt;
	      0</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">10</span>. Only takes effect
	      when  [-CL:cpat] (see above) is set
	      to <span class="underline">yes</span>. Defines the number
	      of 'A' (in forward direction) or 'T' (in reverse direction) that must
	      be present to be considered a poly-A sequence stretch.
	      </p></dd><dt><span class="term">
	      [cp_max_errors_allowed(cpmea)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">1</span>. Only takes effect
	      when  [-CL:cpat] (see above) is set
	      to <span class="underline">yes</span>. Defines the
	      maximum number of errors allowed in the potential poly-A
	      sequence stretch. The distribution of these errors is not
	      important.
	      </p></dd><dt><span class="term">
	      [cp_max_gap_from_end(cpmgfe)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">9</span>. Only
	      takes effect when  [-CL:cpat] (see above) is set
	      to <span class="underline">yes</span>. Defines the number
	      of bases from the end of a sequence (if masked: from the end of
	      the masked area) within which a poly-A sequence stretch is
	      looked for.
	      </p></dd><dt><span class="term">
	      [clip_3ppolybase (c3pp)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd>
	      c3p* options to be described ...
	    </dd><dt><span class="term">
	      [clip_known_adaptorsright (ckar)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. Defines
		whether MIRA should search and clip known sequencing technology
		specific sequencing adaptors. MIRA knows adaptors for Illumina
		best, followed by Ion Torrent and some 454 adaptors.
	      </p><p>
		As the list of known adaptors changes quite frequently, the
		best place to get a list of known adaptors by MIRA is by
		looking at the text files in the program
		sources: <code class="filename">src/mira/adaptorsforclip.*.xxd</code>.
	      </p></dd><dt><span class="term">
	      [ensure_minimum_left_clip(emlc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology used. If on, ensures a
		minimum left clip on each read according to the parameters in
		[-CL:mlcr:smlc]
	      </p></dd><dt><span class="term">
	      [minimum_left_clip_required(mlcr)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default
	      is dependent of the sequencing technology used. If [-CL:emlc] is
	      on, checks whether there is a left clip whose length is at least the one
	      specified here.
	      </p></dd><dt><span class="term">
	      [set_minimum_left_clip(smlc)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used. If [-CL:emlc] is on
	      and actual left clip is &lt;  [-CL:mlcr], set left clip of read to
	      the value given here.
	      </p></dd><dt><span class="term">
	      [ensure_minimum_right_clip(emrc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology used. If on, ensures a
		minimum right clip on each read according to the parameters in
		[-CL:mrcr:smrc]
	      </p></dd><dt><span class="term">
	      [minimum_right_clip_required(mrcr)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default
	      is dependent of the sequencing technology used. If [-CL:emrc] is
	      on, checks whether there is a right clip whose length is at least the one
	      specified here.
	      </p></dd><dt><span class="term">
	      [set_minimum_right_clip(smrc)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is
	      dependent of the sequencing technology used. If [-CL:emrc] is on
	      and actual right clip is &lt;  [-CL:mrcr], set the length of the
	      right clip of read to the value given here.
	      </p></dd><dt><span class="term">
	      [gb_chimeradetectionclip(gbcdc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span> for all jobs.
	      </p><p>
		Very safe chimera detection, should have no false
		positives. For repetitive data, a low number of false
		negatives is possible.
	      </p></dd><dt><span class="term">
	      [kmerjunk_detection(kjd)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is currently <span class="underline">yes</span>.
	      </p><p>
		Reads that look "fishy" are marked as potentially
		chimeric. This mark leads either to a read being completely
		killed or to a read being included into a contig only if no
		other possibility remains.
	      </p><p>
		It is currently suggested to leave this parameter switched on
		and to fine-tune via [-CL:kjck] (see below).
	      </p></dd><dt><span class="term">
	      [kmerjunk_completekill(kjck)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is currently <span class="underline">no</span>
		for genome assemblies and <span class="underline">yes</span> for EST/RNASeq assemblies.
	      </p><p>
		If set to yes, reads marked as junk (see above) are completely
		removed from an assembly. If set to no, reads are not removed
		but included only into a contig as a very last resort.
	      </p><p>
		Having reads killed guarantees assemblies of extremely high
		quality containing virtually no misassembly due to chimeric
		sequencing errors. The downside is that, computationally,
		there is no difference between junk and stretches with correct
		but very low coverage data (generally &lt; 3x coverage). It's
		up to you to decide what is more important: total accuracy or
		longer contigs.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
		  As a rule of thumb: I set this to no for genome assemblies
		  with at least medium average coverage (&#8805; 20-30x) as MIRA
		  does a pretty good job to incorporate these reads so late in
		  an assembly that they do not lead to misassemblies. In
		  transcript assemblies I set this to yes as there is a high
		  chance that high coverage transcripts could be extended via
		  chimeric reads.
		</p><p>
		  With this in mind: deciding for metagenome assemblies would
		  be really difficult though. It probably depends on what you
		  need the data for.
		</p></td></tr></table></div></dd><dt><span class="term">
	      [propose_end_clips(pec)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent on --job quality: currently <span class="underline">yes</span> for all genome assemblies.
		Switched off for EST assemblies (but one might want to switch
		it on sometimes).
	      </p><p>
		This implements a pretty powerful strategy to ensure a good
		"high confidence region" (HCR) in reads, basically eliminating
		99.9% of all junk at the 5' and 3' ends of reads. Note that
		one still must ensure that sequencing vectors (Sanger) or
		adaptor sequences (454, Solexa, Ion Torrent) are "more or less"
		clipped prior to assembly.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		Extremely effective, but should NOT be used for very low
		coverage genomic data, or for EST projects if one wants to
		retain the rarest transcripts.
	      </td></tr></table></div></dd><dt><span class="term">
	      [handle_solexa_ggcxg_problem(pechsgp)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>.
	      </p><p>
		Solexa data has a pretty awful problem in some reads when
		a <code class="literal">GGCxG</code> motif occurs (read more about it in
		the chapter on Solexa data). In short: the sequencing errors
		produced by this problem lead to many false positive SNP
		discoveries in mapping assemblies or problems in contig
		building in de-novo assembly.
	      </p><p>
		MIRA knows about this problem and can look for it in Solexa
		reads during the proposed end clipping and further clip back
		the reads, greatly minimising the impact of this problem.
	      </p></dd><dt><span class="term">
	      [pec_kmer_size(peckms)=<em class="replaceable"><code>10 &#8804; integer &#8804; 32</code></em>]
	    </span></dt><dd><p>
		Default is dependent on technology and quality in the --job
		switch: usually
		between <span class="underline">17</span>
		and <span class="underline">21</span> for Sanger,
		higher for 454 (up to
		<span class="underline">27</span>) and highest for
		Solexa (<span class="underline">31</span>). Ion Torrent
		has at the moment <span class="underline">17</span>,
		but this may change in the future to somewhat higher values.
	      </p><p>
		This parameter defines the minimum number of bases at each end
		of a read that should be free of any sequencing errors.
	      </p></dd><dt><span class="term">
	      [pec_minimum_kmer_forward_reverse(pmkfr)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p>
		Default is dependent on technology and quality in the --job
		switch: usually
		between <span class="underline">1</span>
		and <span class="underline">3</span>
		when  [-CL:pec=yes].
	      </p><p>
		This parameter defines the minimum number of occurrences of a
		kmer at each end of a read that should be free of any
		sequencing errors.
	      </p></dd><dt><span class="term">
	      [rare_kmer_mask(rkm)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent on --job switch: currently
		it's <span class="underline">yes</span> for Solexa data
		and <span class="underline">no</span> otherwise.  If
		this parameter is active, MIRA will completely mask with 'X'
		those parts of a read which have kmer occurrence (in forward
		and reverse direction) less than the value specified
		via  [-CL:pmkfr].
	      </p><p>
		This is a quality ensuring move which improves assembly of
		ultra-high coverage contigs by cleaning out very likely, low
		frequency sequence dependent sequencing errors which passed
		all previous filters. The drawback is that very rare
		transcripts or very lowly covered genome parts with an
		occurrence less than the given value will also be masked
		out. However, Illumina gives so much data that this is almost
		never the case.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		This works only if [-CL:pec] is active.
	      </td></tr></table></div></dd><dt><span class="term">
	      [search_phix174(spx174)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="emphasis"><em>on</em></span> for Illumina data, off
	      otherwise.
	      </p><p>
		PhiX 174 is a small phage of enterobacteria whose DNA is often
		spiked-in during Illumina sequencing to determine error rates
		in datasets and to increase complexity in low-complexity
		samples (amplicon, chipseq etc) to help in cluster
		identification.
	      </p><p>
		If it remains in the sequenced data, it has to be
		seen as a contaminant for projects working with organisms
		which should not contain the PhiX 174 phage.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		However, PhiX may be part of some genome sequences
		(enterobacteria). In these cases, the PhiX174 search will
		report genuine genome data.
	      </td></tr></table></div></dd><dt><span class="term">
	      [filter_phix174(fpx174)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="emphasis"><em>on</em></span> for Illumina data in
	      EST (RNASeq) assemblies, off otherwise.
	      </p><p>
		If [-CL:spx174] is on and  [-CL:fpx174] also,
		MIRA will filter out as contaminants all reads which have
		PhiX174 sequence recognised.
	      </p><p>
		The default value of having the filtering on only for Illumina
		EST (RNASeq) data is a conservative approach: the overwhelming
		majority of RNASeq data will indeed not sequence some
		enterobacteria, so having PhiX174 containing reads thrown out
		is indeed a valid move. For genomes however, MIRA currently is
		cautious and will not filter these reads by default.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		However, PhiX may be part of some genome sequences
		(enterobacteria). In these cases, the PhiX174 filter will
		remove reads from valid genome or expression data.
	      </td></tr></table></div></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_skim_sk"></a>3.4.4.10. 
	  Parameter group: -SKIM (-SK)
	</h4></div></div></div><p>
	  Options that control the behaviour of the initial fast all-against-all read
	  comparison algorithm. Matches found here will be confirmed later in the
	  alignment phase. The new SKIM3 algorithm that is in place since version 2.7.4
	  uses a kmer based algorithm that works similarly to SSAHA (see Ning Z, Cox AJ,
	  Mullikin JC; "SSAHA: a fast search method for large DNA databases."; Genome
	  Res. 2001;11;1725-9).
	</p><p>
	  The major differences of SKIM3 and SSAHA are:
	</p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	      the word length <span class="emphasis"><em>n</em></span> of a kmer (hash) in
	      SSAHA2 must be &lt; 15, but can be up to 32 bases in 64 bit
	      versions of MIRA &lt; 4.0.2 and lower, and up to 256 bases for
	      higher versions of MIRA.
	    </p></li><li class="listitem"><p>
	      SKIM3 uses a maximum fixed amount of RAM that is independent of
	      the word size. E.g., SSAHA would need 4 <span class="underline">exabyte</span> to work with word length of
	      30 bases ... SKIM3 just takes a couple of hundred MB.
	    </p></li></ol></div><p>
	  The parameters for SKIM3:
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [number_of_threads(not)=<em class="replaceable"><code>integer &#8805; 1</code></em>]
	    </span></dt><dd><p>
		Number of threads used in SKIM, default is <span class="underline">2</span>. A few parts of SKIM are
		non-threaded, so the speedup is not exactly linear, but it
		should be very close. E.g., with 2 processors I get a speedup
		of 180-195%, with 4 between 350 and 395%.
	      </p><p>
		Although the main data structures are shared between the
		threads, there's some additional memory needed for each
		thread.
	      </p></dd><dt><span class="term">
	      [also_compute_reverse_complements(acrc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">on</span>. Defines
		whether SKIM searches for matches only in forward/forward
		direction or whether it also looks for forward/reverse
		direction.
	      </p><p>
		You usually will not want to touch the default, except for very
		special application cases where you do not want MIRA to use
		reverse complement sequences at all.
	      </p></dd><dt><span class="term">
	      [kmer_size(kms)=<em class="replaceable"><code>10 &lt; integer &#8804; 256</code></em>]
	    </span></dt><dd><p>
		Defaults are dependent on "--job" switch and sequencing
		technologies used.
	      </p><p>
		Controls the number of consecutive bases
		<span class="emphasis"><em>n</em></span> which are used as a kmer. The
		higher the value, the faster the search. The lower the value,
		the slower the search and the more weak matches are found.
	      </p><p>
		A secondary effect of this parameter is the estimation of MIRA
		on whether stretches within a read sequence are repetitive or
		not. Large values of [-SK:kms] allow a better
		distinction between "almost identical" repeats early in the
		assembly process and, given enough coverage, generally lead to
		fewer and longer contigs.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		This parameter gets overridden by the one-stop-shop parameter
		[-AS:kms] which determines number of passes and kmer
		size to use in each pass.
	      </td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		For de-novo assemblies, values below 15 are not
		recommended. For mapping assemblies, values below 10 should
		not be used.
	      </td></tr></table></div></dd><dt><span class="term">
	      [kmer_save_stepping(kss)=<em class="replaceable"><code>integer &#8805; 1</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">1</span>. This is a parameter
	      controlling the stepping increment <span class="emphasis"><em>s</em></span> with which kmers are
	      generated. This allows for more or less fine grained search as
	      matches are found with at least <span class="emphasis"><em>n+s</em></span> (see  [-SK:kms])
	      equal bases. The higher the value, the faster the search. The
	      lower the value, the more weak matches are found.
	      </p></dd><dt><span class="term">
	      [percent_required(pr)=<em class="replaceable"><code>integer &#8805; 1</code></em>]
	    </span></dt><dd><p> Default is dependent of the sequencing technology used
	      and assembly quality wished. Controls the relative percentage of
	      exact word matches in an approximate overlap that has to be
	      reached to accept this overlap as possible match. Increasing
	      this number will decrease the number of possible alignments that
	      have to be checked by Smith-Waterman later on in the assembly,
	      but it also might lead to the rejection of weaker overlaps (i.e.
	      overlaps that contain a higher number of mismatches).
	      </p><p>
		Note: most of the time it makes sense to keep this parameter
		in sync with [-AL:mrs].
	      </p></dd><dt><span class="term">
	      [maxhits_perread(mhpr)=<em class="replaceable"><code>integer &#8805; 1</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">2000</span>. Controls the maximum
	      number of possible hits one read can transport to the
	      overlap edge reduction phase. If more potential hits are found,
	      only the best ones are taken.
	      </p><p>
		In the pre-2.9.x series, this was an important option for
		tackling projects which contain <span class="emphasis"><em>extreme</em></span>
		assembly conditions. It still is if you run out of memory in
		the graph edge reduction phase. Try then to lower it to 1000,
		500 or even 100.
	      </p><p>
		As the assembly increases in passes ([-AS:nop]),
		different combinations of possible hits will be checked,
		always the probably best ones first. So the accuracy of the
		assembly should only suffer when lowering this number too
		much.
	      </p></dd><dt><span class="term">
	      [filter_megahubs(fmh)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">yes</span>. Defines whether megahubs (reads
	      with extremely many overlaps to other reads) are filtered.
	      See also  [-SK:mhc:mmhr].
	      </p></dd><dt><span class="term">
	      [megahub_cap(mhc)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">150000</span>. Defines the number of kmer
	      overlaps a read may have before it is categorised as megahub.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		You basically don't want to mess with this one. Except for
		assemblies containing very long reads. Rule of thumb: you
		might want to multiply the 150k value by n where n is the
		average read length divided by 2000. Don't overdo, max n at 15
		or so.
	      </td></tr></table></div></dd><dt><span class="term">
	      [max_megahub_ratio(mmhr)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">0</span>. If the number of reads
	      identified as megahubs exceeds the allowed ratio, MIRA will
	      abort.
	      </p><p>
		This is a fail-safe parameter to avoid assemblies where things
		look fishy. In case you see this, you might want to ask for
		advice on the mira_talk mailing list. In short: bacteria
		should never have megahubs (90% of all cases reported were
		contamination of some sort and the other 10% were due to incredibly
		high coverage numbers). Eukaryotes are likely to contain
		megahubs if filtering ([-KS:mnr]) is not on.
	      </p><p>
		EST projects, however, especially from non-normalised libraries,
		will very probably contain megahubs. In this case, you might
		want to think about masking, see [-KS:mnr].
	      </p></dd><dt><span class="term">
	      [sw_check_on_backbones(swcob)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is currently (3.4.0) <span class="underline">yes</span> for accurate mapping
		jobs. Takes effect only in mapping assemblies. Defines whether
		SKIM hits against a backbone (reference) sequence with less
		than 100% identity are double checked with Smith-Waterman to
		improve mapping accuracy.
	      </p><p>
		You will want to set this option to <span class="underline">yes</span> whenever your reference
		sequence contains more complex or numerous repeats and your
		data has SNPs in those areas.
	      </p></dd><dt><span class="term">
	      [max_kmers_in_memory(mkim)=<em class="replaceable"><code>integer &#8805; 100000</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">15000000</span>. Has no influence
	      on the quality of the assembly, only on the maximum memory size
	      needed during the skimming. The default value is equivalent to
	      approximately 500MB.
	      </p><p>
		Note: reducing the number will increase the run time, the more drastically
		the bigger the reduction. On the other hand, increasing the default value
		chosen will not result in speed improvements that are really noticeable. In
		short: leave this number alone if you are not desperate to save a few MB.
	      </p></dd><dt><span class="term">
	      [memcap_hitreduction(mchr)=<em class="replaceable"><code>integer &#8805; 10</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">1024</span>, <span class="underline">2048</span>
	      when Solexa sequences are used. Maximum memory used (in MiB)
	      during the reduction of skim hits.
	      </p><p>
		Note: has no influence on the quality of the assembly,
		reducing the number will increase the runtime, the more
		drastically the bigger the reduction as hits then must be
		streamed multiple times from disk.
	      </p><p>
		The default is good enough for assembly of bacterial genomes
		or small eukaryotes (using Sanger and/or 454 sequences). As
		soon as assembling something bigger than 20 megabases, you
		should increase it to 2048 or 4096 (equivalent to 2 or 4 GiB
		of memory).
	      </p></dd></dl></div><p>
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_hashstatistics_hs"></a>3.4.4.11. 
	  Parameter group: -KMERSTATISTICS (-KS)
	</h4></div></div></div><p>
	  Hash statistics (nowadays called kmer statistics in literature
	  or other software packages) allows to quickly assess reads from a
	  coverage point of view without actually assembling the reads. MIRA
	  uses this as a quick pre-assembly evaluation to find and tag reads
	  which are from repetitive and non-repetitive parts of a project.
	</p><p>
	  The length of the kmer is defined via [-SK:kms]
	  or  [-AS:kms] while the parameters in this section define
	  the boundaries of the different repeat levels.
	</p><p>
	  A more in-depth description on kmer statistics is given in the
	  sections <span class="emphasis"><em>Introduction to 'masking'</em></span>
	  and <span class="emphasis"><em>How does 'nasty repeat' masking work?</em></span> in
	  the chapter dealing with the assembly of hard projects.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [freq_est_minnormal(fenn)=<em class="replaceable"><code>float &gt; 0</code></em>]
	    </span></dt><dd><p>
		During kmer statistics analysis, MIRA will estimate how repetitive parts
		of reads are. Parts which are occurring less than
		[-KS:fenn] times the average occurrence will be tagged
		with a HAF2 (less than average) tag.
	      </p></dd><dt><span class="term">
	      [freq_est_maxnormal(fexn)=<em class="replaceable"><code>float &gt; 0</code></em>]
	    </span></dt><dd><p>
		During kmer statistics analysis, MIRA will estimate how repetitive parts
		of reads are. Parts which are occurring more than
		[-KS:fenn] but less than  [-KS:fexn] times
		the average occurrence will be tagged with a HAF3 (normal) tag.
	      </p></dd><dt><span class="term">
	      [freq_est_repeat(fer)=<em class="replaceable"><code>float &gt; 0</code></em>]
	    </span></dt><dd><p>
		During kmer statistics analysis, MIRA will estimate how repetitive parts
		of reads are. Parts which are occurring more than
		[-KS:fexn] but less than  [-KS:fer] times
		the average occurrence will be tagged with a HAF4 (above average) tag.
	      </p></dd><dt><span class="term">
	      [freq_est_heavyrepeat(fehr)=<em class="replaceable"><code>float &gt; 0</code></em>]
	    </span></dt><dd><p>
		During kmer statistics analysis, MIRA will estimate how repetitive parts
		of reads are. Parts which are occurring more than
		[-KS:fer] but less than  [-KS:fehr] times
		the average occurrence will be tagged with a HAF5 (repeat) tag.
	      </p></dd><dt><span class="term">
	      [freq_est_crazyrepeat(fecr)=<em class="replaceable"><code>float &gt; 0</code></em>]
	    </span></dt><dd><p>
		During kmer statistics analysis, MIRA will estimate how repetitive parts
		of reads are. Parts which are occurring more than
		[-KS:fehr] but less than  [-KS:fecr] times
		the average occurrence will be tagged with a HAF6 (heavy
		repeat) tag. Parts which are occurring more than
		 [-KS:fecr] but less than  [-KS:nrr] times the
		average occurrence will be tagged with a HAF7 (crazy repeat)
		tag.
	      </p></dd><dt><span class="term">
	      [mask_nasty_repeats(mnr)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent on --job
		type: <span class="underline">yes</span> for
		de-novo, <span class="underline">no</span> for mapping.
	      </p><p>
		Tells MIRA to tag (during the kmer statistics phase) read
		subsequences of length [-SK:kms] nucleotides that
		appear more than X times more often than the median occurrence
		of subsequences would otherwise suggest. The threshold X from
		which on subsequences are considered nasty is set by
		 [-KS:nrr] or  [-KS:nrc], the action MIRA
		should take when encountering those sequences is defined
		by  [-KS:ldn] (see below).
	      </p><p>
		When not using lossless digital normalisation
		([-KS:ldn]), the tag used by MIRA will be "MNRr"
		which stands for "Mask Nasty Repeat in read". This tag has an
		active masking function in MIRA and the fast all-against-all
		overlap searcher (SKIM) will then completely ignore the tagged
		subsequences of reads. There's one drawback though: the
		smaller the reads are that you try to assemble, the higher the
		probability that your reads will not span nasty repeats
		completely, therefore leading to an abortion of contig building
		at this site. Reads completely covered by the MNRr tag will
		therefore land in the debris file as no overlap will be found.
	      </p><p>
		This option is extremely useful for assembly of larger
		projects (fungi-size) with a high percentage of repeats.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Although it is expected that bacteria will not really need
		this, leaving it turned on will probably not harm except in
		unusual cases like several copies of (pro-)phages integrated
		in a genome.
	      </td></tr></table></div></dd><dt><span class="term">
	      [nasty_repeat_ratio(nrr)=<em class="replaceable"><code>integer &#8805; 2</code></em>]
	    </span></dt><dd><p>
		Default is depending on the [--job=...]
		parameters. Normally it's high (around 100) for genome
		assemblies, but much lower (20 or less) for EST assemblies.
	      </p><p>
		Sets the ratio from which on subsequences are considered nasty
		and hidden from the kmer statistics overlapper with a
		<span class="emphasis"><em>MNRr</em></span> tag. E.g.: A value of 10 means: mask all
		k-mers of  [-SK:kms] length which are occurring more
		than 10 times more often than the average of the whole project.
	      </p></dd><dt><span class="term">
	      [nasty_repeat_coverage(nrc)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p>
		Default is depending on the [--job=...]
		parameters: <span class="underline">0</span> for genome
		assemblies, <span class="underline">200</span> for EST assemblies.
	      </p><p>
		Closely related to the [-KS:nrr] parameter (see
		above), but while the above works on ratios derived from a
		calculated average, this parameter allows to set an absolute
		value. Note that this parameter will take precedence
		over  [-KS:nrr] if the calculated value of nrr is
		larger than the absolute value given here. A value of 0
		de-activates this parameter.
	      </p></dd><dt><span class="term">
	      [lossless_digital_normalisation(ldn)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent on --job
		type: <span class="underline">yes</span> for denovo
		EST/RNAseq assembly, <span class="underline">no</span>
		otherwise.
	      </p><p>
		Tells MIRA how to treat reads containing nasty repeats
		when [-KS:mnr] is active.
	      </p><p>
		When set to <span class="emphasis"><em>yes</em></span>, MIRA will apply a
		modified digital normalisation step to the reads, effectively
		decreasing the coverage of a given repetitive stretch down to
		a minimum needed to correctly represent one copy of the
		repeat. However, contrary to the published method, MIRA will
		keep enough reads of repetitive regions to also correctly
		reconstruct slightly different variants of the repeats present
		in the genome or EST / RNASeq data set, even if they differ in
		only a single base.
	      </p><p>
		The tag used by MIRA to denote stretches which may have
		contributed to the digital normalisation will be
		"DGNr". Additionally, contigs which contain reads completely
		covered by a DGNr tag will get an additional "_dn" as part of
		their name to show that they contain read representatives for
		digital normalisation. E.g.: "contig_dn_c1".
	      </p><p>
		This option is extremely useful for non-normalised EST /
		RNASeq projects, to get at least the sequence of
		overrepresented transcripts assembled even if the coverage
		values then cannot be interpreted as expression values
		anymore.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		The lossless digital normalisation will be applied as soon as
		the kmer size of the active pass (see [-AS:kms])
		reaches a size of at least 50 or, at the latest, in the second
		to last pass.
	      </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Once digital normalisation has been applied, the
		parameters [-KS:nrr] and  [-KS:nrc] do not
		take effect anymore.
	      </td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		The effect of lossless digital normalisation on genome data
		has not been studied sufficiently by me to approve it for
		genomes. Use with care in genome assemblies.
	      </td></tr></table></div></dd><dt><span class="term">
	      [repeatlevel_in_infofile(rliif)=<em class="replaceable"><code>integer; 0, 5-8</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">6</span>. Sets the
		minimum level of the HAF tags from which on MIRA will report
		tentatively repetitive sequence in the
		<code class="filename">*_info_readrepeats.lst</code> file of the info
		directory.
	      </p><p>
		A value of <span class="underline">0</span> means
		"switched off". The default value of <span class="underline">6</span> means all subsequences tagged
		with <span class="emphasis"><em>HAF6</em></span>, <span class="emphasis"><em>HAF7</em></span> and
		<span class="emphasis"><em>MNRr</em></span> will be logged. If you, e.g., only
		wanted MNRr logged, you'd use <span class="underline">8</span> as parameter value.
	      </p><p>
		See also [-KS:fenn:fexn:fer:fehr:mnr:nrr] to set the
		different levels for the <span class="emphasis"><em>HAF</em></span> and
		<span class="emphasis"><em>MNRr</em></span> tags.
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_align_al"></a>3.4.4.12. 
	  Parameter group: -ALIGN (-AL)
	</h4></div></div></div><p>
	  The align options control the behaviour of the Smith-Waterman alignment
	  routines. Only read pairs which are confirmed here may be included into
	  contigs. Affects both the checking of possible alignments found by SKIM as
	  well as the phase when reads are integrated into a contig.
	</p><p>
	  Every option in this section can be set individually for every sequencing
	  technology, giving a very fine grained control on how reads are aligned for
	  each technology.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [bandwidth_in_percent(bip)=<em class="replaceable"><code>integer &gt; 0 and &#8804;100</code></em>]
	    </span></dt><dd><p> Default
	      is dependent of the sequencing technology used. The banded Smith-Waterman
	      alignment uses this percentage number to compute the bandwidth it has to use
	      when computing the alignment matrix. E.g., expected overlap is 150 bases,
	      bip=10 -&gt; the banded SW will compute a band of 15 bases to each side of
	      the expected alignment diagonal, thus allowing up to 15 unbalanced inserts /
	      deletes in the alignment.  INCREASING AND DECREASING THIS NUMBER:
	      <span class="emphasis"><em>increase</em></span>: will find more non-optimal alignments, but will also
	      increase SW runtime between linear and quadratic.  <span class="emphasis"><em>decrease</em></span>: the other
	      way round, might miss a few bad alignments but gaining speed.
	      </p></dd><dt><span class="term">
	      [bandwidth_min(bmin)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is dependent of the
	      sequencing technology used. Minimum bandwidth in bases to each side.
	      </p></dd><dt><span class="term">
	      [bandwidth_max(bmax)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is dependent of the
	      sequencing technology used. Maximum bandwidth in bases to each side.
	      </p></dd><dt><span class="term">
	      [min_overlap(mo)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is dependent of the
	      sequencing technology used. Minimum number of overlapping bases needed in an
	      alignment of two sequences to be accepted.
	      </p></dd><dt><span class="term">
	      [min_score(ms)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is dependent of the
	      sequencing technology used. Describes the minimum score of an overlap to be
	      taken into account for assembly. MIRA uses a default scoring scheme
	      for SW align: each match counts 1, a match with an N counts 0, each mismatch
	      with a non-N base -1 and each gap -2. Take a bigger score to weed out a
	      number of chance matches, a lower score to perhaps find the single (short)
	      alignment that might join two contigs together (at the expense of computing
	      time and memory).
	      </p></dd><dt><span class="term">
	      [min_relative_score(mrs)=<em class="replaceable"><code>integer &gt; 0 and &#8804;100</code></em>]
	    </span></dt><dd><p> Default is dependent of the sequencing technology
	      used. Describes the min % of matching between two reads to be
	      considered for assembly. Increasing this number will save
	      memory, but one might lose possible alignments. I propose a
	      maximum of 80 here.  Decreasing below 55% will make memory and
	      time consumption probably explode.
	      </p><p>
		Note: most of the time it makes sense to keep this parameter
		in sync with
		[-SK:pr].
	      </p></dd><dt><span class="term">
	      [solexa_hack_max_errors(shme)=<em class="replaceable"><code>integer &gt; -1</code></em>]
	    </span></dt><dd><p>
		Currently a hack just for Solexa/Illumina data.
	      </p><p>
		When running in mapping mode, this defines the maximum number
		of mismatches and gaps a read may have compared to the
		reference to be allowed to map. The result is usually a much
		better mapping in areas with larger discrepancies between
		reference sequence and mapped data. Note that the mapping
		process takes longer if this value is unequal to 0 as MIRA
		will use iterative mapping which involves a certain amount of
		trial and error.
	      </p><p>
		The default value of <span class="underline">-1</span>
		lets MIRA choose this value automatically. It sets it to 15%
		of the average Illumina read lengths loaded.
	      </p><p>
		A value of <span class="underline">0</span> switches off
		this functionality, leading to a much faster mapping
		process. Useful when mapping expression data where coverage
		values may be more important than the best possible alignment.
	      </p></dd><dt><span class="term">
	      [extra_gap_penalty(egp)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology
		used. Defines whether or not to increase penalties applied to
		alignments containing long gaps. Setting this to 'yes' might
		help in projects with frequent repeats. On the other hand, it
		is definitively disturbing when assembling very long reads
		containing multiple long indels in the called base sequence
		... although this should not happen in the first place and is
		a sure sign for problems lying ahead.
	      </p><p>
		When in doubt, set it
		to <span class="underline">yes</span> for EST projects
		and de-novo genome assembly, set it
		to <span class="underline">no</span> for assembly of
		closely related strains (assembly against a backbone).
	      </p><p>
		When set to <span class="underline">no</span>, it is
		recommended to have  [-CO:amgb]
		and  [-CO:amgbemc] both set to yes.
	      </p></dd><dt><span class="term">
	      [egp_level(egpl)=<em class="replaceable"><code>comma separated list of integer &#8805; 0 and &#8804; 100</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology and job
		used. Has no effect if extra_gap_penalty is off.
	      </p><p>
		...
	      </p></dd><dt><span class="term">
	      [max_egp_percent(megpp)=<em class="replaceable"><code>0 &#8804; integer &#8804; 100</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">100</span>. Has no effect if
	      extra_gap_penalty is off. Defines the maximum extra penalty in
	      percent applied to 'long' gaps.
	      
	      
	      
	      
	      
	      
	      
	      
	      
	      
	      
	      </p></dd></dl></div><p>
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_contig_co"></a>3.4.4.13. 
	  Parameter group: -CONTIG (-CO)
	</h4></div></div></div><p>
	  The contig options control the behaviour of the contig objects.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [name_prefix(np)=<em class="replaceable"><code>string</code></em>]
	    </span></dt><dd><p>
		Default is
		<span class="underline">&lt;projectname&gt;</span>. Contigs
		will have this string prepended to their names. Normally,
		the  [project=] line in the manifest will set this.
	      </p></dd><dt><span class="term">
	      [reject_on_drop_in_relscore(rodirs)=<em class="replaceable"><code>integer &#8805; 0 and &#8804;100</code></em>]
	    </span></dt><dd><p>
		Default is dependent of the sequencing technology used.
	      </p><p>
		When adding reads to a contig, reject the reads if the drop in
		the minimum relative score of the alignment of the current
		consensus and the new read is &gt; the expected value
		calculated during the alignment phase. Lower values mean
		stricter checking.
	      </p><p>
		This value is doubled should a read be entered that has an
		assembled template partner (a read pair) at the right distance
		in the current contig.
	      </p></dd><dt><span class="term">
	      [cmin_relative_score(cmrs)=<em class="replaceable"><code>integer &#8805; -1 and &#8804;100</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">-1</span>. Works
		similarly to  [-AL:mrs], but during contig
		construction phase instead of read vs read alignment phase:
		describes the min % of matching between a read being added to
		a contig and the current contig consensus.
	      </p><p>
		If value is set to -1, then the value of [-AL:mrs] is used.
	      </p><p>
		Note: most of the time it makes sense to keep this parameter
		at -1. Else have it at
		approximately <span class="emphasis"><em>[-AL:mrs]-10</em></span> or
		switch it completely off via 0.
	      </p></dd><dt><span class="term">
	      [mark_repeats(mr)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">yes</span>. One of the most important switches in MIRA: if set to
	      <span class="underline">yes</span>, MIRA will try to resolve misassemblies due to repeats by
	      identifying single base stretch differences and tag those critical bases as
	      RMB (Repeat Marker Base, weak or strong). This switch is also needed when
	      MIRA is run in EST mode to identify possible inter-, intra- and
	      intra-and-interorganism SNPs.
	      </p></dd><dt><span class="term">
	      [only_in_result(mroir)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">no</span>. Only
	      takes effect when  [-CO:mr] (see above) is set
	      to <span class="underline">yes</span>. If set
	      to <span class="underline">yes</span>, MIRA will not use
	      the repeat resolving algorithm during build time (and therefore
	      will not be able to take advantage of this), but only before
	      saving results to disk.
	      </p><p>
		This switch is useful in some (rare) cases of mapping assembly.
	      </p></dd><dt><span class="term">
	      [assume_snp_instead_repeat(asir)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">no</span>.
	      Only takes effect when  [-CO:mr] (see above) is set to
	      <span class="underline">yes</span>, effect is also
	      dependent on whether strain data (see
	      [-SB:lsd]) is present or not.  Usually, MIRA will mark
	      bases that differentiate between repeats when a conflict occurs
	      between reads that belong to one strain. If the conflict occurs
	      between reads belonging to different strains, they are marked as
	      SNP. However, if this switch is set
	      to <span class="underline">yes</span>, conflicts within a
	      strain are also marked as SNP.
	      </p><p>
		This switch is mainly used in assemblies of ESTs, it should
		not be set for genomic assembly.
	      </p></dd><dt><span class="term">
	      [min_reads_per_group(mrpg)=<em class="replaceable"><code>integer &#8805; 2</code></em>]
	    </span></dt><dd><p> Default is
	      dependent on the sequencing technology used. Only takes effect when
	      [-CO:mr] (see above) is set
	      to <span class="underline">yes</span>. This defines the
	      minimum number of reads in a group that are needed for the RMB
	      (Repeat Marker Bases) or SNP detection routines to be
	      triggered. A group is defined by the reads carrying the same
	      nucleotide for a given position, i.e., an assembly with mrpg=2
	      will need at least two times two reads with the same nucleotide
	      (having at least a quality as defined in  [-CO:mgqrt])
	      to be recognised as repeat marker or a SNP.  Setting this to a
	      low number increases sensitivity, but might produce a few false
	      positives, resulting in reads being thrown out of contigs
	      because of falsely identified possible repeat markers (or
	      wrongly recognised as SNP).
	      </p></dd><dt><span class="term">
	      [min_neighbour_qual(mnq)=<em class="replaceable"><code>integer &#8805;
	      10</code></em>]
	    </span></dt><dd><p>
		Default is dependent on the sequencing technology used. Only
		takes effect when [-CO:mr] is set
		to <span class="underline">yes</span>. This defines the
		minimum quality of neighbouring bases that a base must have
		for being taken into consideration during the decision whether
		column base mismatches are relevant or not.
	      </p></dd><dt><span class="term">
	      [min_groupqual_for_rmb_tagging(mgqrt)=<em class="replaceable"><code>integer &#8805; 25</code></em>]
	    </span></dt><dd><p>
		Default is dependent on the sequencing technology used. Only
		takes effect when [-CO:mr] is set
		to <span class="underline">yes</span>. This defines the
		minimum quality of a group of bases to be taken into account
		as potential repeat marker. The lower the number, the more
		sensitive you get, but lowering below 25 is not recommended as
		a lot of wrongly called bases can have a quality approaching
		this value and you'd end up with a lot of false positives. The
		higher the overall coverage of your project, the better, and
		the higher you can set this number. A value of 35 will
		probably remove most false positives, a value of 40 will
		probably never show false positives ... but will generate a
		sizable number of false negatives.
		
		
		
		
		
		
	      </p></dd><dt><span class="term">
	      [endread_mark_exclusion_area(emea)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p> Default is dependent on the sequencing technology
	      used. Only takes effect when [-CO:mr] is set to
	      <span class="underline">yes</span>. Using the end of
	      sequences of Sanger type shotgun sequencing is always a bit
	      risky, as wrongly called bases tend to crowd there or some
	      sequencing vector relics hang around. It is even more risky to
	      use these stretches for detecting possible repeats, so one can
	      define an exclusion area where the bases are not used when
	      determining whether a mismatch is due to repeats or not.
	      </p></dd><dt><span class="term">
	      [emea_set1_on_clipping_pec(emeas1clpec)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. When
		 [-CL:pec] is set, the end-read exclusion area can be
		considerably reduced. Setting this parameter will
		automatically do this.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Although the parameter is named "set to 1", it may be that the
		exclusion area is actually a bit larger (2 to 4), depending on
		what users will report back as "best" option.
	      </td></tr></table></div></dd><dt><span class="term">
	      [also_mark_gap_bases(amgb)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is dependent on the sequencing technology
		used. Determines whether columns containing gap bases (indels)
		are also tagged.
	      </p><p>
		Note: it is strongly recommended to not set this to 'yes' for
		454 type data.
	      </p></dd><dt><span class="term">
	      [also_mark_gap_bases_even_multicolumn(amgbemc)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">yes</span>.
	      Takes effect only when  [-CO:amgb] is set to
	      <span class="underline">yes</span>. Determines whether multiple columns containing gap bases
	      (indels) are also tagged.
	      </p></dd><dt><span class="term">
	      [also_mark_gap_bases_need_both_strands(amgbnbs)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is <span class="underline">yes</span>.  Takes effect only when
	       [-CO:amgb] is set to <span class="underline">yes</span>. Determines whether, for
	      tagging columns containing gap bases, both strands need to have a gap.
	      Setting this to <span class="underline">no</span> is not recommended except when working in
	      desperately low coverage situations.
	      </p></dd><dt><span class="term">
	      [force_nonIUPACconsensus_perseqtype(fnic)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>. If set to
		<span class="underline">yes</span>, MIRA will be forced
		to make a choice for a consensus base (A,C,G,T or gap) even in
		unclear cases where it would normally put an IUPAC base. All
		other things being equal (like quality of the possible
		consensus base and other things), MIRA will choose a base by
		either looking for a majority vote or, if that also is not
		clear, by preferring gaps over T over G over C over finally A.
	      </p><p>
		MIRA makes a considerable effort to deduce the right base at
		each position of an assembly. Only when cases begin to be
		borderline it will use an IUPAC code to make you aware of
		potential problems. It
		is <span class="bold"><strong>suggested</strong></span> to leave this
		option to <span class="underline">no</span> as IUPAC
		bases in the consensus are a sign that - if you need 100%
		reliability - you really should have a look at this particular
		place to resolve potential problems. You might want to set
		this parameter to yes in the following cases: 1) when your
		tools that use assembly result cannot handle IUPAC bases and
		you don't care about being absolutely perfect in your data (by
		looking over them manually). 2) when you assemble data without
		any quality values (which you should not do anyway), then this
		method will allow you to get a result without IUPAC bases that
		is "good enough" with respect to the fact that you did not
		have quality values.
	      </p></dd><dt><span class="term">
	      [merge_short_reads(msr)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span> for all
		Solexas when in a mapping assembly, else it's <span class="underline">no</span>. Can only be used in mapping
		assemblies. If set to <span class="underline">yes</span>, MIRA will merge all perfectly
		mapping Solexa reads into longer reads (Coverage Equivalent
		Reads, CERs) while keeping quality and coverage information
		intact.
	      </p><p>
		This feature hugely reduces the number of Solexa reads and
		makes assembly results with Solexa data small enough to be
		handled by current finishing programs (gap4, consed, others)
		on normal workstations.
	      </p></dd><dt><span class="term">
	      [msr_maxerrors(msrme)=<em class="replaceable"><code>integer &#8805; 0</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">0</span> for all
		Solexas when in a mapping assembly. Takes only effect in
		mapping assemblies if  [-CO:msr=yes].
	      </p><p>
		Defines how many "errors" (i.e. differences) a read may have
		to be merged into a coverage equivalent read. Useful only when
		one does not need SNP information from an assembly but wants
		to concentrate either on coverage data or on paired-end
		information at contig ends.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		This feature allows merging of non-perfect reads, which makes
		most SNP information simply disappear from the alignment. Use
		with care!
	      </td></tr></table></div></dd><dt><span class="term">
	      [msr_keepcontigendsunmerged(msrkceu)=<em class="replaceable"><code>-1, integer &gt; 0</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">-1</span> for all
		Solexas when in a mapping assembly. Takes only effect in
		mapping assemblies if  [-CO:msr=yes] and for reads
		which have a paired-end / mate-pair partner actively used in
		the assembly.
	      </p><p>
		If set to a value &gt; 0, MIRA will not merge paired-end /
		mate-pair reads if they map within the given distance of a
		contig end of the original reference sequence
		(backbone). Instead of a fixed value, one can also use
		-1. MIRA will then automatically not merge reads if the
		distance from the contig end is within the maximum size of the
		template insert size of the sequencing library for that read
		(either given via [-GE:tismax] or via XML TRACEINFO
		for the given read).
	      </p><p>
		This feature allows using the data reduction from
		[-CO:msr] while enabling the result of such a mapping
		to be useful in subsequent scaffolding programs to order
		contigs.
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_edit_ed"></a>3.4.4.14. 
	  Parameter group: -EDIT (-ED)
	</h4></div></div></div><p>
	  General options for controlling the integrated automatic editor. The editors
	  generally do a good job cleaning up alignments from typical sequencing
	  errors (like base overcalls etc.). However, they may prove tricky in
	  certain situations:
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      in EST assemblies, they may edit rare transcripts toward almost
	      identical, more abundant transcripts. Usage must be carefully weighed.
	    </p></li><li class="listitem"><p>
	      the editors will not only change bases, but also sometimes delete or
	      insert non-gap bases as needed to improve an alignment when facts (trace
	      signals or other) show that this is what should have been the
	      sequence. However, this can make post processing of assembly results pretty
	      difficult with some formats like ACE, where the format itself contains no
	      way to specify certain edits like deletion. There's nothing one can do about
	      it and the only way to get around this problem is to use file formats with
	      more complete specifications like CAF, MAF (and BAF once supported by MIRA).
	    </p></li></ul></div><p>
	</p><p>
	  The following edit parameters are supported:
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [mira_automatic_contig_editing(mace)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. When set
		to yes, MIRA will use built-in versions of own automatic
		contig editors (see parameters below) to improve alignments.
	      </p></dd><dt><span class="term">
	      [edit_kmer_singlets(eks)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span> for all
		sequencing technologies, but only takes effect
		if  [-ED:mace] is on (see above).
	      </p><p>
		When set to yes, MIRA uses the alignment information of a
		complete contig at places with sequencing errors which lead to
		unique kmers and corrects the error according to the alignment.
	      </p><p>
		This is an extremely conservative yet very effective editing
		strategy and can therefore be kept always activated.
	      </p></dd><dt><span class="term">
	      [edit_homopolymer_overcalls(ehpo)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span> for 454
		and Ion Torrent, but only takes effect if  [-ED:mace]
		is on (see above).
	      </p><p>
		When set to yes, MIRA uses the alignment information of a
		complete contig at places with potential homopolymer
		sequencing errors and corrects the error according to the
		alignment.
	      </p><p>
		This editor should be switched on only for sequencing
		technologies with known homopolymer sequencing problems. That
		is: currently only 454 and Ion.
	      </p></dd><dt><span class="term">
	      [edit_automatic_contig_editing(eace)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>. When set
		to yes, MIRA will use built-in versions of the "EdIt"
		automatic contig editor (see parameters below) to correct
		sequencing errors in Sanger reads.
	      </p><p>
		EdIt will try to resolve discrepancies in the contig by
		performing trace signal analysis and correct even hard to resolve
		errors.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		The current development version has a memory leak in
		this editor, therefore the option cannot be turned
		on.
	      </td></tr></table></div></dd><dt><span class="term">
	      [strict_editing_mode(sem)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>. Only for
		Sanger data. If set to yes, the automatic editor will not take
		error hypotheses with a low probability into account, even if
		all the requirements to make an edit are fulfilled.
	      </p></dd><dt><span class="term">
	      [confirmation_threshold(ct)=<em class="replaceable"><code>integer, 0 &lt; x &#8804; 100</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">50</span>. Only for
		Sanger data. The higher this value, the more strict the
		automatic editor will apply its internal rule set. Going below
		40 is not recommended.
	      </p></dd></dl></div><p>
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_misc_mi"></a>3.4.4.15. 
	  Parameter group: -MISC (-MI)
	</h4></div></div></div><p>
	  Options which would not fit elsewhere.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [iknowwhatido(ikwid)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>. This
		switch tells MIRA that you know what you are doing in some
		situations and forces it not to stop when it thinks something is
		really wrong, but to simply continue.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		You generally should not set this flag except in cases
		where MIRA stopped and the warning / error message told you to
		get around that very specific problem by setting this flag.
	      </td></tr></table></div></dd><dt><span class="term">
	      [large_contig_size(lcs)=<em class="replaceable"><code>integer &gt;
	      0</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">500</span>. This
		parameter has absolutely no influence whatsoever on the
		assembly process of MIRA. It is only used in the reporting within
		the <code class="filename">*_assembly_info.txt</code> file after the
		assembly where MIRA reports statistics on
		<span class="emphasis"><em>large</em></span> contigs and
		<span class="emphasis"><em>all</em></span> contigs.  [-MI:lcs] is the
		threshold value for dividing the contigs into these two
		categories.
	      </p></dd><dt><span class="term">
	      [large_contig_size_for_stats(lcs4s)=<em class="replaceable"><code>integer &gt;
	      0</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">5000</span> for
		 [--job=genome] and <span class="underline">1000</span> for  [--job=est].
	      </p><p>
		This parameter is used for internal statistics calculations
		and has a subtle influence when being in a
		[--job=genome] assembly mode.
	      </p><p>
		MIRA uses coverage information of an assembly project to find
		out about potentially repetitive areas in reads (and thus, a
		genome). To calculate statistics which are reflecting the
		approximate truth regarding the average coverage of a genome,
		the "large contig size for stats" value of
		[-MI:lcs4s] is used as a cutoff threshold: contigs
		smaller than this value do not contribute to the calculation
		of average coverage while contigs larger or equal to this
		value do.
	      </p><p>
		This reflects two facts: on the one hand - especially with
		short read sequencing technologies and in projects without
		read pair libraries - contigs containing predominantly
		repetitive sequences are of a relatively small size. On the
		other hand, reads which could not be placed into contigs
		(maybe due to a sequencing technology dependent motif error)
		often enough form small contigs with extremely low
		coverage.
	      </p><p>
		 It should be clear that one does not want any of the above
		 when calculating average coverage statistics and having this
		 cutoff discards small contigs which tend to muddy the
		 picture. If in doubt, don't touch this parameter.
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_misc_nw"></a>3.4.4.16. 
	  Parameter group: -NAG_AND_WARN (-NW)
	</h4></div></div></div><p>
	  Parameters which let MIRA warn you about unusual things or potential
	  problems. The flags in this parameter section come in three
	  flavours: <span class="emphasis"><em>stop</em></span>, <span class="emphasis"><em>warn</em></span> and
	  <span class="emphasis"><em>no</em></span> which let MIRA either stop, give a warning
	  or do nothing if a specific problem is detected.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [check_nfs(cnfs)=<em class="replaceable"><code>stop|warn|no</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">stop</span>. MIRA
		will check whether the tmp directory is running on a NFS
		mount.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
		  You should never ever at all run MIRA on a NFS mounted
		  directory ... or face the fact that the assembly process
		  may very well take 5 to 10 times longer (or more) than
		  normal. You have been warned.
		</p><p>
		  The reason for the slowdown is the same as why one should
		  never run a BLAST search on a big database being located on
		  a NFS volume: access via network is terribly slow when
		  compared to local disks, at least if you have not invested a
		  lot of money into specialised solutions.
		</p></td></tr></table></div></dd><dt><span class="term">
	      [check_duplicate_readnames(cdrn)=<em class="replaceable"><code>stop|warn|no</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">stop</span>. MIRA
		will check for duplicate read names after loading.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
		  Duplicate read names usually hint at a serious problem with
		  your input and should really, really be fixed. You can
		  choose to ignore this error by switching off this flag, but
		  this will almost certainly lead to problems with result
		  files (ACE and CAF for sure, maybe also SAM) and probably to
		  other unexpected effects.
		</p></td></tr></table></div></dd><dt><span class="term">
	      [check_template_problems(ctp)=<em class="replaceable"><code>stop|warn|no</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">stop</span>. MIRA
		will check read template naming after loading.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
		  Problems in read template naming point to problems with read
		  names or to broken template information. You should try to
		  find the cause of the problem instead of ignoring this error
		  message.
		</p></td></tr></table></div></dd><dt><span class="term">
	      [check_maxreadnamelength(cmrnl)=<em class="replaceable"><code>stop|warn|no</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">stop</span>. MIRA
		will check whether the length of the names of your reads
		surpass the given number of characters (see  [-NW:mrnl]).
	      </p><p>
		While MIRA and many other programs have no problem with long read names,
		some older programs have restrictions concerning the length of
		the read name. Example given: the pipeline <code class="literal">CAF -&gt;
		caf2gap -&gt; gap2caf</code> will stop working at
		the <span class="command"><strong>gap2caf</strong></span> stage if there are read names
		having &gt; 40 characters where the names differ only at &gt;40
		characters.
	      </p><p>
		This should be a warning only, but as a couple of people were
		bitten by this, the default behaviour of MIRA is to stop when
		it sees that potential problem. You might want to rename your
		reads to have &#8804; 40 characters.
	      </p><p>
		On the other hand, you also can ignore this potential problem
		and force MIRA to continue by using the parameter:
		[-NW:cmrnl=warn] or  [-NW:cmrnl=no]
	      </p></dd><dt><span class="term">
	      [maxreadnamelength(mrnl)=<em class="replaceable"><code>integer &#8805;
	      0</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">40</span>. This
		defines the effective check length for  [-NW:cmrnl].
	      </p></dd><dt><span class="term">
	      [check_average_coverage(cac)=<em class="replaceable"><code>stop|warn|no</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">stop</span>. In
		genome de-novo assemblies, MIRA will perform checks early in
		the assembly process whether the average coverage to be
		expected exceeds a given value (see  [-NW:acv]).
	      </p><p>
		With today's sequencing technologies (especially Illumina, but
		also Ion Torrent and 454), many people simply take everything
		they get and throw it into an assembly. Which, in the case of
		Illumina and Ion, can mean they try to assemble their organism
		with a coverage of 100x, 200x and more (I've seen trials with
		more than 1000x).
	      </p><p>
		This is not good. Not. At. All! For two reasons (well, three
		to be precise).
	      </p><p>
		The first reason is that, usually, one does not sequence a
		single cell but a population of cells. If this population is
		not clonal (i.e., it contains subpopulations with genomic
		differences with each other), assemblers will be able to pick
		up these differences in the DNA once a certain sequence count
		is reached and they will try to reconstruct a genome containing
		all clonal variations, treating these variations as potential
		repeats with slightly different sequences. Which, of course,
		will be wrong and I am pretty sure you do not want that.
	      </p><p>
		The second and way more important reason is that none of the
		current sequencing technologies is completely error free. Even
		more problematic, they contain both random and non-random
		sequencing errors. Especially the latter can become a big
		hurdle if these non-random errors are so prevalent that they
		suddenly appear to be valid sequence to an assembler. This in
		turn leads to false repeat detection, hence possibly contig
		breaks or even wrong consensus sequence. You don't want that,
		do you?
	      </p><p>
		The last reason is that overlap based assemblers (like MIRA
		is) need <span class="emphasis"><em>exponentially</em></span> more time and
		memory when the coverage increases. So keeping the coverage
		comparatively low helps you there.
	      </p></dd><dt><span class="term">
	      [average_coverage_value(acv)=<em class="replaceable"><code>integer &#8805;
	      0</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">80</span> for
		de-novo assemblies, in mapping assemblies it is 120 for Ion
		Torrent and 160 for Illumina data (might change in
		future). This defines the effective coverage to check for in
		 [-NW:cac].
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_directory_dir_di"></a>3.4.4.17. 
	  Parameter group: -DIRECTORY (-DIR, -DI)
	</h4></div></div></div><p>
	  General options for controlling where to find or where to write data.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [tmp_redirected_to(trt)=<em class="replaceable"><code>&lt;directoryname&gt;</code></em>]
	    </span></dt><dd><p>
		Default is an empty string. When set to a non-empty string,
		MIRA will create the MIRA-temporary directory at the given
		location instead of using the current working directory.
	      </p><p>
		This option is particularly useful for systems which have
		solid state disks (SSDs) and some very fast disk subsystems
		which can be used for temporary files. Or in projects where
		the input and output files reside on a NFS mounted directory
		(current working dir), to put the tmp directory somewhere
		outside the NFS (see also: Things you should not do).
	      </p><p>
		In both cases above, and for larger projects, MIRA then runs
		a lot faster.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Prior to MIRA 4.0rc2, users had to make sure themselves that
		the target directory did not already exist. MIRA now handles
		this automatically by creating directory names with a random
		substring attached.
	      </td></tr></table></div></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_output_out"></a>3.4.4.18. 
	  Parameter group: -OUTPUT (-OUT)
	</h4></div></div></div><p>
	  Options for controlling which results to write to which type of files.
	  Additionally, a few options allow output customisation of textual
	  alignments (in text and HTML files).
	</p><p>
	  There are 3 types of results: result, temporary results and extra
	  temporary results. One probably needs only the results. Temporary
	  and extra temporary results are written while building different
	  stages of a contig and are given as convenience for trying to find
	  out why MIRA set some RMBs or disassembled some contigs.
	</p><p>
	  Output can be generated in these formats: CAF, Gap4 Directed
	  Assembly, FASTA, ACE, TCS, WIG, HTML and simple text.
	</p><p>
	  Naming conventions of the files follow the rules described in
	  section <span class="bold"><strong>Input / Output</strong></span>, subsection
	  <span class="bold"><strong>Filenames</strong></span>.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [savesimplesingletsinproject(sssip)=<em class="replaceable"><code>on|y[es]|t[rue],off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>. Controls
		whether 'unimportant' singlets are written to the result
		files.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Note that a value larger than 1 of the [-AS:mrpc]
		parameter will disable the function of this parameter.
	      </td></tr></table></div></dd><dt><span class="term">
	      [savetaggedsingletsinproject(stsip)=<em class="replaceable"><code>on|y[es]|t[rue],off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default
		is <span class="underline">yes</span>. Controls whether
		singlets which have certain tags (see below) are written to
		the result files, even if  [-OUT:sssip] (see above) is
		set.
	      </p><p>
		If one of the (SRMr, CRMr, WRMr, SROr, SAOr, SIOr) tags
		appears in a singlet, MIRA will see that the singlets had been
		part of a larger alignment in earlier passes and even was part
		of a potentially 'important' decision. To give the possibility
		to human finishers to trace back the decision, these singlets
		can be written to result files.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Note that a value larger than 1 of the [-AS:mrpc]
		parameter will disable the function of this parameter.
	      </td></tr></table></div></dd><dt><span class="term">
	      [remove_rollover_tmps(rrot)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default
	      is <span class="underline">yes</span>. Removes log and
	      temporary files once they should not be needed anymore during
	      the assembly process.
	      </p></dd><dt><span class="term">
	      [remove_tmp_directory(rtd)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default
	      is <span class="underline">no</span>. Removes the
	      complete tmp directory at the end of the assembly process. Some
	      logs and temporary files contain useful information that you may
	      want to analyse though, therefore the default of MIRA is not to
	      delete it.
	      </p></dd><dt><span class="term">
	      [output_result_caf(orc)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">yes</span>.
	      </p></dd><dt><span class="term">
	      [output_result_maf(orm)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">yes</span>.
	      </p></dd><dt><span class="term">
	      [output_result_gap4da(org)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		If set to <span class="underline">yes</span>, MIRA will
		automatically switch back
		to <span class="underline">no</span> (and cannot be
		forced to 'yes') when 454 or Solexa reads are present in the
		project as this ensures that the file system does not get
		flooded with millions of files.
	      </td></tr></table></div></dd><dt><span class="term">
	      [output_result_fasta(orf)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>.
	      </p></dd><dt><span class="term">
	      [output_result_ace(ora)=<em class="replaceable"><code>on|y[es]|t[rue],
	      off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default
	      is <span class="underline">no</span>.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
		  The ACE output of MIRA is conforming to the file
		  specification given in the consed documentation. However,
		  due to a bug in consed, consed cannot correctly load tags
		  set by MIRA.
		</p><p>
		  There is a workaround: the MIRA distribution comes with a
		  small Tcl script <span class="command"><strong>fixACE4consed.tcl</strong></span>
		  which implements a workaround to allow consed to load the
		  ACE generated by MIRA. Use the script like this:
		</p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>fixACE4consed.tcl <em class="replaceable"><code>infile.ace</code></em> &gt;<em class="replaceable"><code>outfile.ace</code></em></code></strong></pre><p>
		  and then load the resulting outfile into consed.
		</p></td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		ACE is the least suited file format for NGS data. Use it only
		when absolutely necessary.
	      </td></tr></table></div></dd><dt><span class="term">
	      [output_result_txt(ort)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_result_tcs(ors)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">yes</span>.
	      </p></dd><dt><span class="term">
	      [output_result_html(orh)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default
	      is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_tmpresult_caf(otc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_tmpresult_maf(otm)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_tmpresult_gap4da(otg)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_tmpresult_fasta(otf)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_tmpresult_ace(ota)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_tmpresult_txt(ott)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_tmpresult_tcs(ots)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default
	      is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_tmpresult_html(oth)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_exttmpresult_caf(oetc)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_exttmpresult_gap4da(oetg)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_exttmpresult_fasta(oetf)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_exttmpresult_ace(oeta)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_exttmpresult_txt(oett)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [output_exttmpresult_html(oeth)=<em class="replaceable"><code>on|y[es]|t[rue], off|n[o]|f[alse]</code></em>]
	    </span></dt><dd><p>
		Default is <span class="underline">no</span>.
	      </p></dd><dt><span class="term">
	      [text_chars_per_line(tcpl)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">60</span>. When producing an output in text format
	      ( [-OUT:ort|ott|oett]), this parameter defines how many bases
	      each line of an alignment should contain.
	      </p></dd><dt><span class="term">
	      [html_chars_per_line(hcpl)=<em class="replaceable"><code>integer &gt; 0</code></em>]
	    </span></dt><dd><p> Default is
	      <span class="underline">60</span>. When producing an output in HTML format
	      ( [-OUT:orh|oth|oeth]), this parameter defines how many bases
	      each line of an alignment should contain.
	      </p></dd><dt><span class="term">
	      [text_endgap_fillchar(tegfc)=<em class="replaceable"><code>&lt;single character&gt;</code></em>]
	    </span></dt><dd><p> Default
	      is <span class="underline"> </span> (a blank). When producing an output in text format
	      ( [-OUT:ort|ott|oett]), endgaps are filled up with this
	      character.
	      </p></dd><dt><span class="term">
	      [html_endgap_fillchar(hegfc)=<em class="replaceable"><code>&lt;single character&gt;</code></em>]
	    </span></dt><dd><p> Default
	      is <span class="underline"> </span> (a blank). When producing an output in HTML format
	      ( [-OUT:orh|oth|oeth]), end-gaps are filled up with this
	      character.
	      </p></dd></dl></div><p>
	</p></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_resuming_assemblies"></a>3.5. 
      Resuming / restarting assemblies
    </h2></div></div></div><p>
      It may happen that a MIRA run is interrupted - sometimes rather harshly
      - due to events more or less outside your control like, e.g., power
      failures, machine shutdowns for maintenance, missing disk space,
      run-time quotas etc. This may be less of a problem when assembling or
      mapping small data sets with run times between a couple of minutes up to
      a few hours, but becomes a nuisance for larger data sets like in small
      eukaryotes or RNASeq samples where the run time is measured in days.
    </p><p>
      If this happens in de-novo assemblies, MIRA has
      a <span class="emphasis"><em>resume</em></span> functionality: at predefined points in the
      assembly process, MIRA writes out special files to disk which enable it
      to resume the assembly at the point where these files were
      written. Starting MIRA in resume mode is pretty easy: simply add the
      resume flag  [-r] on a command line like this:
    </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>mira -r ...</code></strong></pre><p>
      where the ellipsis ("...") above stands for the rest of the command line you would have used to start a new assembly.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_input_output"></a>3.6. 
      Input / Output
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_directories"></a>3.6.1. 
	Directories
      </h3></div></div></div><p>
	Since version 3.0.0, MIRA now puts all files and directories it
	generates into one sub-directory which is named
	<code class="filename"><em class="replaceable"><code>projectname</code></em>_assembly</code>. This directory contains up to four
	sub-directories:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_d_results</code>: this directory contains all the
	    output files of the assembly in different formats.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_d_info</code>: this directory contains information
	    files of the final assembly. They provide statistics as well as, e.g.,
	    information (easily parsable by scripts) on which read is found in which
	    contig etc.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_d_tmp</code>:
	    this directory contains tmp files and temporary assembly files. It
	    can be safely removed after an assembly as there may easily be a
	    few GB of data in there that are normally not needed anymore.
	  </p><p>
	    In case of problems: please do not delete. I will get in touch
	    with you for additional information that might possibly be present
	    in the tmp directory.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_d_chkpt</code>: this directory
	    contains checkpoint files needed to resume assemblies that crashed
	    or were stopped.
	  </p></li></ul></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_filenames"></a>3.6.2. 
	Filenames
      </h3></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_output"></a>3.6.2.1. 
	  Output
	</h4></div></div></div><p>
	  These result output files and sub-directories are placed in the
	  <em class="replaceable"><code>projectname</code></em>_d_results directory after a run of MIRA.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.&lt;type&gt;</code>
	    </span></dt><dd><p> Assembled project written in type =
	      (<span class="emphasis"><em>maf</em></span> / <span class="emphasis"><em>gap4da</em></span> / <span class="emphasis"><em>caf</em></span> /
	      <span class="emphasis"><em>ace</em></span> / <span class="emphasis"><em>fasta</em></span> /
	      <span class="emphasis"><em>html</em></span> / <span class="emphasis"><em>tcs</em></span> /
	      <span class="emphasis"><em>wig</em></span> / <span class="emphasis"><em>text</em></span>) format by
	      MIRA, final result.
	      </p><p>
		Type <span class="emphasis"><em>gap4da</em></span> is a directory containing
		experiment files and a file of filenames (called 'fofn'), all
		other types are files. <span class="emphasis"><em>gap4da</em></span>,
		<span class="emphasis"><em>caf</em></span>, <span class="emphasis"><em>ace</em></span> contain the
		complete assembly information suitable for import into
		different post-processing tools (gap4, consed and
		others). <span class="emphasis"><em>html</em></span> and
		<span class="emphasis"><em>text</em></span> contain visual representations of
		the assembly suited for viewing in browsers or as simple text
		file. <span class="emphasis"><em>tcs</em></span> is a summary of a contig suited
		for "quick" analysis from command-line tools or even visual
		inspection. <span class="emphasis"><em>wig</em></span> is a file containing
		coverage information (useful for mapping assemblies) which can
		be loaded and shown by different genome browsers (IGB, GMOD,
		UCSC and probably many more).
	      </p><p>
		<span class="emphasis"><em>fasta</em></span> contains the contig consensus
		sequences (and .fasta.qual the consensus qualities). Please
		note that they come in two flavours:
		<span class="underline">padded</span>
		and <span class="underline">unpadded</span>. The padded
		versions may contain stars (*) denoting gap base positions
		where there was some minor evidence for additional bases, but
		not strong enough to be considered as a real base. Unpadded
		versions have these gaps removed. Padded versions have an
		additional postfix <span class="emphasis"><em>.padded</em></span>, while
		unpadded versions <span class="emphasis"><em>.unpadded</em></span>.
	      </p></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_LargeContigs_out.&lt;type&gt;</code>
	    </span></dt><dd>
	      These files are only written when MIRA runs in
	      <span class="emphasis"><em>de-novo</em></span> mode. They usually contain a subset
	      of contigs deemed 'large' from the whole project. More details
	      are given in the chapter "working with results of MIRA."
	    </dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_assembly_statistics_and_information_files"></a>3.6.2.2. 
	  Assembly statistics and information files
	</h4></div></div></div><p>
	  These information files are placed in the
	  <em class="replaceable"><code>projectname</code></em>_d_info directory after a run of
	  MIRA.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_assembly.txt</code>
	    </span></dt><dd><p>
		This file contains basic information about the
		assembly. MIRA will split the information in two
		parts: information about <span class="emphasis"><em>large</em></span>
		contigs and information about all contigs.
	      </p><p>
		For more information on how to interpret this file,
		please consult the chapter on "Results" of the MIRA
		documentation manual.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		In contrast to other information files, this file
		always appears in the "info" directory, even when just
		intermediate results are reported.
	      </td></tr></table></div></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_contigreadlist.txt</code>
	    </span></dt><dd><p> This file contains information on which reads have been
	      assembled into which contigs (or singlets).
	      </p></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_contigstats.txt</code>
	    </span></dt><dd><p> This file contains statistics about the contigs
	      themselves, their length, average consensus quality, number of
	      reads, maximum and average coverage, average read length, number
	      of A, C, G, T, N, X and gaps in consensus.
	      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
		For contigs containing digitally normalised reads, the coverage numbers may sometimes seem strange. E.g.: a contig may contain only one read, but have an average coverage of 3. This means that the read was a representative for 3 reads. The coverage numbers are computed as if all 3 reads had been assembled instead of the representative. In EST/RNASeq projects, these numbers thus represent the (more or less) true expression coverage.
	      </td></tr></table></div></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_consensustaglist.txt</code>
	    </span></dt><dd><p> This file contains
	      information about the tags (and their position) that are present in the
	      consensus of a contig.
	      </p></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_largecontigs.txt</code>
	    </span></dt><dd><p>For de-novo assemblies, this file contains the name of the
	      contigs which pass the (adaptable) 'large contig' criterion.
	      </p></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_readrepeats.lst</code>
	    </span></dt><dd><p>
		Tab delimited file with three columns: read name, repeat level tag, sequence.
	      </p><p>
		This file permits a quick analysis of the repetitiveness of
		different parts of reads in a project. See
		[-SK:rliif] to control from which repetitive level on
		subsequences of reads are written to this file.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Reads can have more than one entry in this file. E.g., with
		standard settings (<code class="literal">-SK:rliif=6</code>) if the
		start of a read is covered by MNRr, followed by a HAF3 region
		and finally the read ends with HAF6, then there will be two
		lines in the file: one for the subsequence covered by MNRr,
		one for HAF6.
	      </td></tr></table></div></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_readstooshort</code>
	    </span></dt><dd><p> A list containing the
	      names of those reads that have been sorted out of the assembly before any
	      processing started only due to the fact that they were too short.
	      </p></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_readtaglist.txt</code>
	    </span></dt><dd><p> This file contains
	      information about the tags and their position that are present in each
	      read. The read positions are given relative to the forward direction of the
	      sequence (i.e. as it was entered into the assembly).
	      </p></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_WARNINGS_*.txt</code>
	    </span></dt><dd><p>
		These files collect warning messages MIRA dumped out
		throughout the assembly process. These warnings cover a wide
		area of things monitored by MIRA and can - together with the
		output written to STDOUT - give an insight as to why an
		assembly does not behave as expected. There are three warning
		files representing different levels of
		criticality: <span class="emphasis"><em>critical</em></span>, <span class="emphasis"><em>medium</em></span>
		and <span class="emphasis"><em>minor</em></span>. These files may be empty,
		meaning that no warning of the corresponding level was
		printed. It is strongly suggested to have a look at least at
		critical warnings during and after an assembly run.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		These files are quite new to MIRA and not all warning messages
		appear there yet. This will come over time.
	      </td></tr></table></div></dd><dt><span class="term">
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_error_reads_invalid</code>
	    </span></dt><dd><p> A list of sequences that
	      have been found to be invalid due to various reasons (given in the output of
	      the assembler).
	      </p></dd></dl></div><p>
	</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_file_formats"></a>3.6.3. 
	File formats
      </h3></div></div></div><p>
	MIRA can write almost all of the following formats and can read most
	of them.
      </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	    <code class="filename">ACE</code>
	  </span></dt><dd><p> This is an old assembly file format used mainly by phrap and
	    consed. Support for .ace output is currently only in test status in
	    MIRA as documentation on that format is ... sparse and I currently
	    don't have access to consed to verify my assumptions.
	    </p><p> Using consed, you will need to load projects with -nophd to
	    view them. Tags (in reads and consensus) are fully supported. The
	    only hitch: consed has a bug which prevents it from reading consensus
	    tags which are located throughout the whole file (as MIRA writes
	    per default). The solution to that is easy: filter the ACE file
	    through the fixACE4consed.tcl script which is provided in the MIRA
	    distributions, then all should be well.
	    </p><p> If you don't have consed, you might want to try clview
	    (<a class="ulink" href="http://www.tigr.org/tdb/tgi/software/" target="_top">http://www.tigr.org/tdb/tgi/software/</a>) from TIGR
	    to look at .ace files.
	    </p></dd><dt><span class="term">
	    <code class="filename">BAM</code>
	  </span></dt><dd>
	    The binary cousin of the SAM format. MIRA neither reads nor writes
	    BAM, but BAMs can be created out of SAMs (which can be created via
	    <span class="command"><strong>miraconvert</strong></span>).
	  </dd><dt><span class="term">
	    <code class="filename">CAF</code>
	  </span></dt><dd><p> Common Assembly Format (CAF) developed by the Sanger
	    Centre. <a class="ulink" href="http://www.sanger.ac.uk/resources/software/caf.html" target="_top">http://www.sanger.ac.uk/resources/software/caf.html</a> provides a
	    description of the format and some software documentation as well as the
	    source for compiling caf2gap and gap2caf (thanks to Rob Davies
	    for this).
	    </p></dd><dt><span class="term">
	    <code class="filename">EXP</code>
	  </span></dt><dd><p> Standard experiment files used in genome
	    sequencing. Correct EXP files are expected. Especially the ID
	    record (containing the id of the reading) and the LN record
	    (containing the name of the corresponding trace file) should be
	    correctly set. See <a class="ulink" href="http://www.sourceforge.net/projects/staden/" target="_top">http://www.sourceforge.net/projects/staden/</a> for links to
	    online format description.
	    </p></dd><dt><span class="term">
	    <code class="filename">FASTA</code>
	  </span></dt><dd><p> A simple format for sequence data, see
	    <a class="ulink" href="http://www.ncbi.nlm.nih.gov/BLAST/fasta.html" target="_top">http://www.ncbi.nlm.nih.gov/BLAST/fasta.html</a>. An
	    often used extension of that format is used to also store quality
	    values in a similar fashion, these files have a .fasta.qual
	    ending.
	    </p><p>
	      MIRA writes two kinds of FASTA files for
	      results: <span class="emphasis"><em>padded</em></span> and
	      <span class="emphasis"><em>unpadded</em></span>. The difference is that the padded
	      version still contains the gap (pad) character (an asterisk) at
	      positions in the consensus where some of the reads apparently
	      had some more bases than others but where the consensus routines
	      decided to treat them as artifacts. The
	      <span class="emphasis"><em>unpadded</em></span> version has the gaps removed.
	    </p></dd><dt><span class="term">
	    <code class="filename">GBF, GBK</code>
	  </span></dt><dd><p> GenBank file format as used at the NCBI to describe
	    sequences. MIRA is able to read and write this format (but only
	    for viruses or bacteria) for using sequences as backbones in an
	    assembly. Features of the GenBank format are also transferred
	    automatically to Staden compatible tags.
	    </p><p>
	      If possible, use GFF3 instead (see below).
	    </p></dd><dt><span class="term">
	    <code class="filename">GFF3</code>
	  </span></dt><dd><p> General feature format used to describe sequences and
	    features on these sequences. MIRA is able to read and write this
	    format.
	    </p></dd><dt><span class="term">
	    <code class="filename">HTML</code>
	  </span></dt><dd><p> Hypertext Markup Language. Projects written in HTML format
	    can be viewed directly with any table capable browser. Display is even
	    better if the browser knows style sheets (CSS).
	    </p></dd><dt><span class="term">
	    <code class="filename">MAF</code>
	  </span></dt><dd><p> MIRA Assembly Format (MAF). A faster and more compact form
	    than EXP, CAF or ACE. See documentation in separate file.
	    </p></dd><dt><span class="term">
	    <code class="filename">PHD</code>
	  </span></dt><dd><p> This file type originates from the phred base caller
	    and contains basically -- along with some other status information -- the
	    base sequence, the base quality values and the peak indices, but not the
	    sequence traces themselves.
	    </p></dd><dt><span class="term">
	    <code class="filename">SAM</code>
	  </span></dt><dd><p> The Sequence Alignment/Map Format. MIRA does not write SAM
	    directly, but <span class="command"><strong>miraconvert</strong></span> can be used for
	    converting a MAF (or CAF) file to SAM.
	    </p><p>
	      MIRA cannot read SAM though.
	    </p></dd><dt><span class="term">
	    <code class="filename">SCF</code>
	  </span></dt><dd><p> The Staden trace file format that has established itself as
	    compact standard replacement for the much bigger ABI files. See
	    <a class="ulink" href="http://www.sourceforge.net/projects/staden/" target="_top">http://www.sourceforge.net/projects/staden/</a> for
	    links to online format description.
	    </p><p>
	      The SCF files should be V2-8bit, V2-16bit, V3-8bit or V3-16bit
	      and can be packed with compress or gzip.
	    </p></dd><dt><span class="term">
	    <code class="filename">traceinfo.XML</code>
	  </span></dt><dd><p> XML based file with information relating to
	    traces. Used at the NCBI and ENSEMBL trace archive to store additional
	    information (like clippings, insert sizes etc.) for projects. See further
	    down for a description of the fields used and
	    <a class="ulink" href="http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&amp;f=rfc&amp;m=main&amp;s=rfc" target="_top">http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&amp;f=rfc&amp;m=main&amp;s=rfc</a> for a full description of all fields.
	    </p></dd><dt><span class="term">
	    <code class="filename">TCS</code>
	  </span></dt><dd><p> Transpose Contig Summary. A text file as written by MIRA
	    which gives a summary of a contig in tabular fashion, one line per
	    base. Nicely suited for "quick" analysis from command line tools,
	    scripts, or even visual inspection in file viewers or spreadsheet
	    programs.
	    </p><p> In the current file version (TCS 1.0), each column is
	    separated by at least one space from the next. Vertical bars are
	    inserted as visual delimiter to help inspection by eye. The
	    following columns are written into the file:
	    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
		  contig name (width 20)
		</p></li><li class="listitem"><p>
		  padded position in contigs (width 3)
		</p></li><li class="listitem"><p>
		  unpadded position in contigs (width 3)
		</p></li><li class="listitem"><p>
		  separator (a vertical bar)
		</p></li><li class="listitem"><p>
		  called consensus base
		</p></li><li class="listitem"><p>
		  quality of called consensus base (0-100), but MIRA itself caps at 90.
		</p></li><li class="listitem"><p>
		  separator (a vertical bar)
		</p></li><li class="listitem"><p>
		  total coverage in number of reads. This number can be higher than the
		  sum of the next five columns if Ns or IUPAC bases are present in the
		  sequence of reads.
		</p></li><li class="listitem"><p>
		  coverage of reads having an "A"
		</p></li><li class="listitem"><p>
		  coverage of reads having a "C"
		</p></li><li class="listitem"><p>
		  coverage of reads having a "G"
		</p></li><li class="listitem"><p>
		  coverage of reads having a "T"
		</p></li><li class="listitem"><p>
		  coverage of reads having an "*" (a gap)
		</p></li><li class="listitem"><p>
		  separator (a vertical bar)
		</p></li><li class="listitem"><p>
		  quality of "A" or "--" if none
		</p></li><li class="listitem"><p>
		  quality of "C" or "--" if none
		</p></li><li class="listitem"><p>
		  quality of "G" or "--" if none
		</p></li><li class="listitem"><p>
		  quality of "T" or "--" if none
		</p></li><li class="listitem"><p>
		  quality of "*" (gap) or "--" if none
		</p></li><li class="listitem"><p>
		  separator (a vertical bar)
		</p></li><li class="listitem"><p>
		  Status. This field sums up the evaluation of MIRA whether you should
		  have a look at this base or not. The content can be one of the following:
		</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
		      everything OK: a colon (:)
		    </p></li><li class="listitem"><p>
		      unclear base calling (IUPAC base): a "!M"
		    </p></li><li class="listitem"><p>
		      potentially problematic base calling involving a gap or low quality: a "!m"
		    </p></li><li class="listitem"><p>
		      consensus tag(s) of MIRA that hint to problems: a "!$". Currently,
		      the following tags will lead to this marker: SRMc, WRMc, DGPc, UNSc,
		      IUPc.
		    </p></li></ul></div></li><li class="listitem"><p>
		  list of consensus tags at that position, tags are delimited by a
		  space. E.g.: "DGPc H454"
		</p></li></ol></div></dd></dl></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_stdout_stderr"></a>3.6.4. 
	STDOUT/STDERR
      </h3></div></div></div><p>
	The actual stage of the assembly is written to STDOUT, giving status messages
	on what MIRA is actually doing. Dumping to STDERR is almost not used
	anymore by MIRA, remnants will disappear over time.
      </p><p>
	Some debugging information might also be written to STDOUT if MIRA
	generates error messages.
      </p><p>
	On errors, MIRA will dump these also to STDOUT. Basically, three error classes
	exist:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    WARNING: Messages in this error class do not stop the assembly but
	    are meant as information for the user. In some rare cases these
	    errors are due to (an always possible) error in the I/O routines
	    of MIRA, but nowadays they are mostly due to unexpected (read:
	    wrong) input data and can be traced back to errors in the
	    preprocessing stages. If these errors arise, you
	    definitely <span class="bold"><strong>DO</strong></span> want to check how
	    and why these errors came into those files in the first place.
	  </p><p>
	    Frequent causes for warnings include missing SCF files, SCF files
	    containing known quirks, EXP files containing known quirks etc.
	  </p></li><li class="listitem"><p>
	    FATAL: Messages in this error class actually stop the
	    assembly. These are mostly due to missing files that MIRA needs or
	    to very garbled (wrong) input data.
	  </p><p>
	    Frequent causes include naming an experiment file in the 'file of filenames'
	    that could not be found on the disk, same experiment file twice in the
	    project, suspected errors in the EXP files, etc.
	  </p></li><li class="listitem"><p>
	    INTERNAL: These are true programming errors that were caught by internal
	    checks. Should this happen, please mail the output of STDOUT and STDERR to
	    the author.
	  </p></li></ol></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_ssaha2smalt"></a>3.6.5. 
	SSAHA2 / SMALT ancillary data
      </h3></div></div></div><p>
	The <span class="command"><strong>ssaha2</strong></span> or <span class="command"><strong>smalt</strong></span> programs -
	both from the Sanger Centre - can be used to detect possible vector
	sequence stretches in the input data for the assembly.  MIRA can load
	the result files of a
	<span class="command"><strong>ssaha2</strong></span> or <span class="command"><strong>smalt</strong></span> run and
	interpret the results to tag the possible vector sequences at the ends
	of reads.
      </p><p>
	Note that this also uses the parameters
	[-CL:msvsgs:msvsmfg:msvsmeg] (see below).
      </p><p>
	ssaha2 must be called like this "<code class="literal">ssaha2
	  &lt;ssaha2options&gt; vector.fasta sequences.fasta</code>"
	to generate an output that can be parsed by MIRA. In the above
	example, replace <code class="filename">vector.fasta</code> by the name
	of the file with your vector sequences and
	<code class="filename">sequences.fasta</code> by the name of the file
	containing your sequencing data.
      </p><p>
	smalt must be called like this: "<code class="literal">smalt map -f ssaha
	  &lt;ssaha2options&gt; hash_index sequences.fasta</code>"
      </p><p>
	This makes you basically independent from any other commercial or
	license-requiring vector screening software. For Sanger reads, a
	combination of <span class="command"><strong>lucy</strong></span> and
	<span class="command"><strong>ssaha2</strong></span> or <span class="command"><strong>smalt</strong></span> together with
	this parameter should do the trick. For reads coming from 454
	pyro-sequencing, <span class="command"><strong>ssaha2</strong></span> or
	<span class="command"><strong>smalt</strong></span> and this parameter will also work very
	well. See the usage manual for a walkthrough example on how to use
	SSAHA2 / SMALT screening data.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	The output format of SSAHA2 must be the native output format
	(<code class="literal">-output ssaha2</code>). For SMALT, the output
	option <code class="literal">-f ssaha</code> must be used. Other formats cannot
	be parsed by MIRA.
      </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	I currently use the following SSAHA2 options:
	<code class="literal">-kmer 8 -skip 1 -seeds 1 -score 12 -cmatch 9 -ckmer
	  6</code></td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	Anyone contributing SMALT parameters?
      </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	The sequence vector clippings generated from SSAHA2 /
	SMALT data do not replace sequence vector clippings loaded via
	the EXP, CAF or XML files, they rather extend them.
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_xml_traceinfo"></a>3.6.6. 
	XML TRACEINFO ancillary data
      </h3></div></div></div><p>
	MIRA extracts the following data from the TRACEINFO files:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    trace_name (required)
	  </p></li><li class="listitem"><p>
	    trace_file (recommended)
	  </p></li><li class="listitem"><p>
	    trace_type_code (recommended)
	  </p></li><li class="listitem"><p>
	    trace_end (recommended)
	  </p></li><li class="listitem"><p>
	    clip_quality_left (recommended)
	  </p></li><li class="listitem"><p>
	    clip_quality_right (recommended)
	  </p></li><li class="listitem"><p>
	    clip_vector_left (recommended)
	  </p></li><li class="listitem"><p>
	    clip_vector_right (recommended)
	  </p></li><li class="listitem"><p>
	    strain (recommended)
	  </p></li><li class="listitem"><p>
	    template_id (recommended for paired end)
	  </p></li><li class="listitem"><p>
	    insert_size (recommended for paired end)
	  </p></li><li class="listitem"><p>
	    insert_stdev (recommended for paired end)
	  </p></li><li class="listitem"><p>
	    machine_type (optional)
	  </p></li><li class="listitem"><p>
	    program_id (optional)
	  </p></li></ul></div><p>
      </p><p>
	Other data types are also read, but the info is not used.
      </p><p>
	Here's an example of a TRACEINFO file with ancillary info:
      </p><pre class="screen">
&lt;?xml version="1.0"?&gt;
&lt;trace_volume&gt;
&lt;trace&gt;
  &lt;trace_name&gt;GCJAA15TF&lt;/trace_name&gt;
  &lt;program_id&gt;PHRED (0.990722.G) AND TTUNER (1.1)&lt;/program_id&gt;
  &lt;template_id&gt;GCJAA15&lt;/template_id&gt;
  &lt;trace_direction&gt;FORWARD&lt;/trace_direction&gt;
  &lt;trace_end&gt;F&lt;/trace_end&gt;
  &lt;clip_quality_left&gt;3&lt;/clip_quality_left&gt;
  &lt;clip_quality_right&gt;622&lt;/clip_quality_right&gt;
  &lt;clip_vector_left&gt;1&lt;/clip_vector_left&gt;
  &lt;clip_vector_right&gt;944&lt;/clip_vector_right&gt;
  &lt;insert_stdev&gt;600&lt;/insert_stdev&gt;
  &lt;insert_size&gt;2000&lt;/insert_size&gt;
&lt;/trace&gt;
&lt;trace&gt;
  ...
&lt;/trace&gt;
...
&lt;/trace_volume&gt;</pre><p>
	See
	<a class="ulink" href="http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&amp;f=rfc&amp;m=main&amp;s=rfc" target="_top">http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&amp;f=rfc&amp;m=main&amp;s=rfc</a>
	for a full description of all fields and more info on the TRACEINFO XML format.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_contig_naming"></a>3.6.7. 
	Contig naming
      </h3></div></div></div><p>
	MIRA names contigs the following
	way: <span class="emphasis"><em>&lt;projectname&gt;_&lt;contigtype&gt;&lt;number&gt;</em></span>. While <span class="emphasis"><em>&lt;projectname&gt;</em></span>
	is dictated by the  [--project=] parameter
	and <span class="emphasis"><em>&lt;number&gt;</em></span> should be clear,
	the <span class="emphasis"><em>&lt;contigtype&gt;</em></span> might need additional
	explaining. There are currently four contig types existing:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    _c: these are "normal" contigs
	  </p></li><li class="listitem"><p>
	    _rep_c: only for genome assembly mode. These are contigs
	    containing only repetitive areas. These contigs
	    had <span class="emphasis"><em>_lrc</em></span> as type in previous versions of MIRA,
	    this was changed to the <span class="emphasis"><em>_rep_c</em></span> to make things
	    clearer.
	  </p></li><li class="listitem"><p>
	    _s: these are singlet-contigs. Technically: "contigs" with a
	    single read.
	  </p></li><li class="listitem"><p>
	    _dn: this is an additional contig type which can occur when MIRA
	    ran a digital normalisation step during the assembly. Contigs
	    which contain reads completely covered by a DGNr tag will get an
	    additional "_dn" as part of their name to show that they contain
	    read representatives for digital normalisation. E.g.:
	    "contig_dn_c1".
	  </p><p>
	    Reads covered only partly by the DGNr tag do not trigger the _dn
	    naming.
	  </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note: Important side note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Important side note</th></tr><tr><td align="left" valign="top"> Due to the digital
	    normalisation step, the coverage numbers in the info file
	    regarding contig statistics will not represent the number of
	    reads in the contig, but they will show an approximation of
	    the true coverage or expression value as if there had not been
	    a digital normalisation step performed. The approximation may
	    be around 10 to 20% below the true value.
	  </td></tr></table></div></li></ol></div><p>
	Basically, for genome assemblies MIRA starts to build contigs in areas
	which seem "rock solid", i.e., not a repetitive region (main decision
	point) and nice coverage of good reads. Contigs which started like
	this get a <span class="emphasis"><em>_c</em></span> name. If during the assembly MIRA
	reaches a point where it cannot start building a contig in a
	non-repetitive region, it will name the contig
	<span class="emphasis"><em>_rep_c</em></span> instead of <span class="emphasis"><em>_c</em></span>. This
	is why "_rep_c" contigs occur late in a genome assembly.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	MIRA has a different understanding of "rock solid" when in EST/RNASeq
	assembly: here, MIRA will try to reconstruct a full length gene
	sequence, starting with the most abundant genes.
      </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	Depending on the settings of [-AS:mrpc], your project may or
	may not contain <span class="emphasis"><em>_s</em></span> singlet-contigs. Also note
	that reads landing in the debris file will not get assigned to
	singlet-contigs and hence not get <span class="emphasis"><em>_s</em></span> names.
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_recovering_strain_specific_consensus"></a>3.6.8. 
	Recovering strain specific consensus as FASTA
      </h3></div></div></div><p>
	In case you used strain information in an assembly, you can
	recover the consensus for just any given strain
	by using <span class="command"><strong>miraconvert</strong></span> and convert from a
	full assembly format (e.g. MAF or CAF) which also carries
	strain information to FASTA. MIRA will automatically detect
	the strain information and create one FASTA file per strain
	encountered.
      </p><p>
	It will also create a blend of all strains encountered and
	conveniently add "AllStrains" to the name of these files. Note that
	this blend may or may not be something you need, but in some
	cases I found it to be useful.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_tags_used_in_the_assembly_by_mira_and_edit"></a>3.7. 
      Tags used in the assembly by MIRA and EdIt
    </h2></div></div></div><p>
      MIRA uses and sets a couple of tags during the assembly process. That
      is, if information is known before the assembly, it can be stored in tags (in
      the EXP and CAF formats) and will be used in the assembly.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_tags_read_and_used"></a>3.7.1. 
	Tags read (and used)
      </h3></div></div></div><p>
	This section lists "foreign" tags, i.e., tags whose definition was made
	by other software packages than MIRA.
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    ALUS, REPT: Sequence stretches tagged as ALUS (ALU Sequence) or REPT
	    (general repetitive sequence) will be handled with extreme care during the
	    assembly process. The allowed error rate after automatic contig editing
	    within these stretches is normally far below the general allowed error rate,
	    leading to much higher stringency during the assembly process and
	    subsequently to a better repeat resolving in many cases.
	  </p></li><li class="listitem"><p>
	    Fpas: GenBank feature for a poly-A sequence. Used in EST, cDNA or
	    transcript assembly. Either read in the input files or set when using
	    [-CL:cpat]. This allows keeping the poly-A sequence in
	    the reads during assembly without them interfering as massive
	    repeats or as mismatches.
	  </p></li><li class="listitem"><p>
	    FCDS, Fgen: GenBank features as described in GBF/GBK files or set in the
	    Staden package are used to make some SNP impact analysis on genes.
	  </p></li><li class="listitem"><p>
	    other. All other tags in reads will be read and passed through the
	    assembly without being changed and they currently do not influence the
	    assembly process.
	  </p></li></ul></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_tags_set_and_used"></a>3.7.2. 
	Tags set (and used)
      </h3></div></div></div><p>
	This section lists tags which MIRA sets (and reads of course), but that other
	software packages might not know about.
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    UNSr, UNSc: <span class="bold"><strong>UNS</strong></span>ure
	    in <span class="bold"><strong>R</strong></span>ead
	    respectively <span class="bold"><strong>C</strong></span>ontig.  These tags
	    denote positions in an assembly with conflicts that could not be
	    resolved automatically by MIRA. These positions should be looked
	    at during the finishing process.
	  </p><p>
	    For assemblies using good sequences and enough coverage, something
	    like 0.01% of the consensus positions have such a tag (e.g. ~300 UNSc
	    tags for a genome of 3 megabases).
	  </p></li><li class="listitem"><p>
	    SRMr, WRMc: <span class="bold"><strong>S</strong></span>trong <span class="bold"><strong>R</strong></span>epeat <span class="bold"><strong>M</strong></span>arker and
	    <span class="bold"><strong>W</strong></span>eak <span class="bold"><strong>R</strong></span>epeat <span class="bold"><strong>M</strong></span>arker. These
	    tags are set in two flavours: as
	    SRM<span class="bold"><strong>r</strong></span> and
	    WRM<span class="bold"><strong>r</strong></span> when set in reads, and as
	    SRM<span class="bold"><strong>c</strong></span> and
	    WRM<span class="bold"><strong>c</strong></span> when set in the
	    consensus. These tags are used on an individual per base basis for
	    each read. They denote bases that have been identified as crucial
	    for resolving repeats, often denoting a single SNP within several
	    hundreds or thousands of bases. While a SRM is quite certain, the
	    WRM really is either weak (there wasn't enough comforting
	    information in the vicinity to be really sure) or involves gap
	    columns (which is always a bit tricky).
	  </p><p>
	    MIRA will automatically set these tags when it encounters repeats
	    and will tag exactly those bases that can be used to discern the
	    differences.
	  </p><p>
	    Seeing such a tag in the consensus means that MIRA was not able to
	    finish the disentanglement of that special repeat stretch or that
	    it found a new one in one of the last passes without having the
	    opportunity to resolve the problem.
	  </p></li><li class="listitem"><p>
	    DGPc: <span class="bold"><strong>D</strong></span>ubious <span class="bold"><strong>G</strong></span>ap <span class="bold"><strong>P</strong></span>osition in
	    <span class="bold"><strong>C</strong></span>onsensus. Set whenever the gap to base ratio in a column of 454
	    reads is between 40% and 60%.
	  </p></li><li class="listitem"><p>
	    SAO, SRO, SIO: <span class="bold"><strong>S</strong></span>NP intr<span class="bold"><strong>A</strong></span> <span class="bold"><strong>O</strong></span>rganism,
	    <span class="bold"><strong>S</strong></span>NP inte<span class="bold"><strong>R</strong></span> <span class="bold"><strong>O</strong></span>rganism, <span class="bold"><strong>S</strong></span>NP <span class="bold"><strong>I</strong></span>ntra
	    and inter <span class="bold"><strong>O</strong></span>rganism. As for SRM
	    and WRM, these tags have an <span class="bold"><strong>r</strong></span>
	    appended when set in reads and
	    a <span class="bold"><strong>c</strong></span> appended when set in the
	    consensus. These tags denote SNP positions.
	  </p><p>
	    MIRA will automatically set these tags when it encounters SNPs and
	    will tag exactly those bases that can be used to discern the
	    differences.  They denote SNPs as they occur within an organism
	    (SAO), between two or more organisms (SRO) or within and between
	    organisms (SIO).
	  </p><p>
	    Seeing such a tag in the consensus means that MIRA set this as a
	    valid SNP in the assembly pass. Seeing such tags only in reads (but not in
	    the consensus) shows that in a previous pass, MIRA thought these
	    bases to be SNPs but that in later passes, this SNP does not appear anymore
	    (perhaps due to resolved misassemblies).
	  </p></li><li class="listitem"><p>
	    STMS: (only hybrid assemblies). The <span class="bold"><strong>S</strong></span>equencing <span class="bold"><strong>T</strong></span>ype
	    <span class="bold"><strong>M</strong></span>ismatch <span class="bold"><strong>S</strong></span>olved
	    is tagged to positions in the assembly where the consensus of
	    different sequencing technologies (Sanger, 454, Ion Torrent, Solexa, PacBio, SOLiD)
	    reads differ, but MIRA thinks it found out the correct
	    solution. Often this is due to low coverage of one of the types
	    and an additional base calling error.
	  </p><p>
	    Sometimes this depicts real differences where possible explanations
	    might include: slightly different bugs were sequenced or a
	    mutation occurred during library preparation.
	  </p></li><li class="listitem"><p>
	    STMU: (only hybrid assemblies). The <span class="bold"><strong>S</strong></span>equencing <span class="bold"><strong>T</strong></span>ype
	    <span class="bold"><strong>M</strong></span>ismatch <span class="bold"><strong>U</strong></span>nresolved
	    is tagged to positions in the assembly where the consensus of
	    different sequencing technologies (Sanger, 454, Ion Torrent, Solexa, SOLiD)
	    reads differ, but MIRA could not find a good resolution. Often this
	    is due to low coverage of one of the types and an additional base
	    calling error.
	  </p><p>
	    Sometimes this depicts real differences where possible explanations
	    might include: slightly different bugs were sequenced or a mutation
	    occurred during library preparation.
	  </p></li><li class="listitem"><p>
	    MCVc: The <span class="bold"><strong>M</strong></span>issing <span class="bold"><strong>C</strong></span>o<span class="bold"><strong>V</strong></span>erage in <span class="bold"><strong>C</strong></span>onsensus.
	    Set in assemblies with more than one strain. If a strain has no coverage at
	    a certain position, the consensus gets tagged with this tag (and the name of
	    the strain which misses this position is put in the comment). Additionally,
	    the sequence in the result files for this strain will have an @ character.
	  </p></li><li class="listitem"><p>
	    MNRr: (only with [-KS:mnr] active). The <span class="bold"><strong>M</strong></span>asked
	    <span class="bold"><strong>N</strong></span>asty <span class="bold"><strong>R</strong></span>epeat tags are set over those parts of a read that
	    have been detected as being many more times present than the average
	    sub-sequence. MIRA will hide these parts during the initial
	    all-against-all overlap finding routine (SKIM3) but will otherwise happily
	    use these sequences for consensus generation during contig building.
	  </p></li><li class="listitem"><p>
	    FpAS: See "Tags read (and used)" above.
	  </p></li><li class="listitem"><p>
	    ED_C, ED_I, ED_D: EDit Change, EDit Insertion, EDit Deletion. These
	    tags are set by the integrated automatic editor EdIt and show which edit
	    actions have been performed.
	  </p></li><li class="listitem"><p>
	    HAF2, HAF3, HAF4, HAF5, HAF6, HAF7. These
	    are <span class="bold"><strong>HA</strong></span>sh <span class="bold"><strong>F</strong></span>requency
	    tags which show the status of read parts in comparison to the
	    whole project. Only set if  [-AS:ard] is active (default
	    for genome assemblies).
	  </p><p>
	    More info on how to use the information conveyed by HAF tags can
	    be found in the section dealing with repeats and HAF tags in
	    finishing programs further down in this manual.
	  </p><p>
	    HAF2 coverage below average ( standard setting at &lt; 0.5 times average)
	  </p><p>
	    HAF3 coverage is at average ( standard setting at &#8805; 0.5 times average and &#8804; 1.5 times average)
	  </p><p>
	    HAF4 coverage above average ( standard setting at &gt; 1.5 times average and &lt; 2 times average)
	  </p><p>
	    HAF5 probably repeat ( standard setting at &#8805; 2 times average and &lt; 5 times average)
	  </p><p>
	    HAF6 'heavy' repeat ( standard setting at &gt; 8 times average)
	  </p><p>
	    HAF7 'crazy' repeat ( standard setting at &gt; 20 times average)
	  </p></li></ul></div><p>
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_contigs_singlets_debris"></a>3.8. 
      Where reads end up: contigs, singlets, debris
    </h2></div></div></div><p>
      At the start, things are simple: a read either aligns with other reads or it does not. Reads which
      align with other reads form contigs, and these MIRA will save in the results with a contig name
      of <span class="emphasis"><em>_c</em></span>.
    </p><p>
      However, not all reads can be placed in an assembly. This can have several reasons and
      these reads may end up at two different places in the result files: either in the
      <span class="emphasis"><em>debris</em></span> file, then just as a name entry, or as singlet (a "contig"
      with just one read) in the regular results.
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  reads are too short and get filtered out (before or after the MIRA
	  clipping stages). These invariably land in the debris file.
	</p></li><li class="listitem"><p>
	  reads are real singlets: they contain genuine sequence but have no
	  overlap with any other read. These get either caught by the
	  [-CL:pec] clipping filter or during the SKIM phase.
	</p></li><li class="listitem"><p>
	  reads contain mostly or completely junk.
	</p></li><li class="listitem"><p>
	  reads contain chimeric sequence (therefore: they're also junk)
	</p></li></ol></div><p>
      MIRA filters out these reads in different stages: before and after read
      clipping, during the SKIM stage, during the Smith-Waterman overlap
      checking stage or during contig building. The exact place where these
      single reads land is dependent on why they do not align with other
      reads. Reads landing in the debris file will have the reason and stage
      attached to the decision.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_snp_discovery"></a>3.9. 
      Detection of bases distinguishing non-perfect repeats and SNP discovery
    </h2></div></div></div><p>
      MIRA is able to find and tag SNPs in any kind of data -- be it genomic
      or EST -- in both de-novo and mapping assemblies ... provided it knows
      which read in an assembly is coming from which strain, cell line or
      organism.
    </p><p>
      The SNP detection routines are based on the same routines as the
      routines for detecting non-perfect repeats. In fact, MIRA can even
      distinguish between bases marking a misassembled repeat from bases
      marking a SNP within the same project.
    </p><p>
      All you need to do to enable this feature is to set
      [-CO:mr=yes] (which is standard in all
      <code class="literal">--job=...</code> incantations of <span class="command"><strong>mira</strong></span> and
      in some steps of <span class="command"><strong>miraSearchESTSNPs</strong></span>). Furthermore, you
      will need to provide <span class="emphasis"><em>strain information</em></span>, either in
      the manifest file or in ancillary NCBI TRACEINFO XML files.
    </p><p>
      The effect of using strain names attached to reads can be described
      briefly like this. Assume that you have 6 reads (called R1 to R6), three
      of them having an <code class="literal">A</code> at a given position, the other
      three a <code class="literal">C</code>.
    </p><pre class="screen">
R1   ......A......
R2   ......A......
R3   ......A......
R4   ......C......
R5   ......C......
R6   ......C......</pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      This example is just that: an example. It uses just 6 reads, with two
      times three reads as read groups for demonstration purposes and without
      looking at qualities. For MIRA to recognise SNPs, a few things must come
      together (e.g. for many sequencing technologies it wants forward and
      backward reads when in de-novo assembly) and a couple of parameters can
      be set to adjust the sensitivity. Read more about the parameters:
      [-CO:mrpg:mnq:mgqrt:emea:amgb:amgbemc:amgbnbs]</td></tr></table></div><p>
      Now, assume you did not give any strain information. MIRA will most
      probably recognise a problem and, having no strain information, assume
      it made an error by assembling two different repeats of the same
      organism. It will tag the bases in the reads with repeat marker tags
      (SRMr) and the base in the consensus with a SRMc tag (to point at an
      unresolved problem). In a subsequent pass, MIRA will then not assemble
      these six reads together again, but create two contigs like this:
    </p><pre class="screen">
Contig1:
R1   ......A......
R2   ......A......
R3   ......A......

Contig2:
R4   ......C......
R5   ......C......
R6   ......C......</pre><p>
      The bases in the reads will keep their SRMr tags, but the consensus
      base of each contig will not get SRMc as there is no conflict anymore.
    </p><p>
      Now, assume you gave reads R1, R2 and R3 the strain information "human",
      and read R4, R5 and R6 "chimpanzee". MIRA will then create this:
    </p><pre class="screen">
R1 (hum)  ......<span class="bold"><strong>A</strong></span>......
R2 (hum)  ......<span class="bold"><strong>A</strong></span>......
R3 (hum)  ......<span class="bold"><strong>A</strong></span>......
R4 (chi)  ......<span class="bold"><strong>C</strong></span>......
R5 (chi)  ......<span class="bold"><strong>C</strong></span>......
R6 (chi)  ......<span class="bold"><strong>C</strong></span>......</pre><p>
      Instead of creating two contigs, it will create again one contig ... but
      it will tag the bases in the reads with a SROr tag and the position in
      the contig with a SROc tag. The SRO tags (<span class="bold"><strong>S</strong></span>NP inte<span class="bold"><strong>R</strong></span>
      <span class="bold"><strong>O</strong></span>rganisms) tell you: there's a SNP
      between those two (or multiple) strains/organisms/whatever.
    </p><p>
      Changing the above example a little, assume you have this assembly early
      on during the MIRA process:
    </p><pre class="screen">
R1 (hum)  ......A......
R2 (hum)  ......A......
R3 (hum)  ......A......
R4 (chi)  ......A......
R5 (chi)  ......A......
R6 (chi)  ......A......
R7 (chi)  ......C......
R8 (chi)  ......C......
R9 (chi)  ......C......</pre><p>
      Because "chimp" has a SNP within itself (<code class="literal">A</code> versus
      <code class="literal">C</code>) and there's a SNP between "human" and "chimp"
      (also <code class="literal">A</code> versus <code class="literal">C</code>), MIRA will see a
      problem and set a tag, this time a SIOr tag: <span class="bold"><strong>S</strong></span>NP <span class="bold"><strong>I</strong></span>ntra- and
      inter <span class="bold"><strong>O</strong></span>rganism.
      </p><p>
	MIRA does not like conflicts occurring within an organism and will try
	to resolve these cleanly. After setting the SIOr tags, MIRA will
	re-assemble in subsequent passes this:
    </p><pre class="screen">
Contig1:
R1 (hum)  ......<span class="bold"><strong>A</strong></span>......
R2 (hum)  ......<span class="bold"><strong>A</strong></span>......
R3 (hum)  ......<span class="bold"><strong>A</strong></span>......
R4 (chi)  ......<span class="bold"><strong>A</strong></span>......
R5 (chi)  ......<span class="bold"><strong>A</strong></span>......
R6 (chi)  ......<span class="bold"><strong>A</strong></span>......

Contig2:
R7 (chi)  ......<span class="bold"><strong>C</strong></span>......
R8 (chi)  ......<span class="bold"><strong>C</strong></span>......
R9 (chi)  ......<span class="bold"><strong>C</strong></span>......</pre><p>
      The reads in Contig1 (hum+chi) and Contig2 (chi) will keep their SIOr
      tags, the consensus will have no SIOc tag as the "problem" was
      resolved.
    </p><p>
      When presented with conflicting information regarding SNPs and possible
      repeat markers or SNPs within an organism, MIRA will always first try to
      resolve the repeat markers. Assume the following situation:
    </p><pre class="screen">
R1 (hum)  ......A...T......
R2 (hum)  ......A...G......
R3 (hum)  ......A...T......
R4 (chi)  ......C...G......
R5 (chi)  ......C...T......
R6 (chi)  ......C...G......</pre><p>
      While the first discrepancy column can be "explained away" by a SNP
      between organisms (it will get a SROr/SROc tag), the second column
      cannot and will get a SIOr/SIOc tag. After that, MIRA opts to get the
      SIO conflict resolved:
    </p><pre class="screen">
Contig1:
R1 (hum)  ......A...T......
R3 (hum)  ......A...T......
R5 (chi)  ......C...T......

Contig2:
R2 (hum)  ......A...G......
R4 (chi)  ......C...G......
R6 (chi)  ......C...G......</pre></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_data_reduction"></a>3.10. 
      Data reduction: subsampling vs. lossless digital normalisation
    </h2></div></div></div><p>
      Some data sets have way too much data. Sometimes it is simply more than
      needed: e.g., performing a de-novo genome assembly with enough reads
      for 300x coverage is like taking a sledgehammer to crack a
      nut. Sometimes it is even more than is good for an assembly (see also:
      motif dependent sequencing errors).
    </p><p>
      MIRA being an overlap-based assembler, reducing a data set helps to keep
      time and memory requirements low. There are basically two ways to
      perform this: reduction by subsampling and reduction by digital
      normalisation. Both methods have their pros and cons and can be used
      effectively in different scenarios.
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	  <span class="emphasis"><em>Subsampling</em></span> is a process to create a smaller,
	  hopefully representative set from a larger data set.
	</p><p>
	  In sequencing, various ways exist to perform subsampling. As
	  sequencing data sets from current sequencing technologies can be
	  seen as essentially randomised when coming fresh from the machine,
	  the selection step can be as easy as selecting the
	  first <span class="emphasis"><em>n</em></span> reads. When the input data set is not
	  random (e.g. in SAM/BAM files with mapped data), one must resort to
	  random selection of reads.
	</p><p>
	  Subsampling must be done by the user prior to assembly with MIRA.
	</p><p>
	  On the upside, subsampling preserves the exact copy number structure
	  of the input data set: a repeat with n copies in a genome will
	  always be represented by reads forming n copies of the repeat in the
	  reduced data set. Furthermore, subsampling is comparatively
	  insensitive to motif dependent sequencing errors. On the downside,
	  subsampling is more likely to lose rare events of the data set
	  (e.g., rare SNPs of a cell population or rare transcripts in
	  EST/RNASeq). Also, in EST/RNASeq projects, subsampling will not be
	  able to reduce extraordinary coverage events to a level which makes
	  the assembly not painfully slow. Examples of the latter are rRNA
	  genes or highly expressed house-keeping genes, where today's Illumina
	  data sets sometimes contain enough data to reach coverage numbers
	  &#8805; 100,000x or even a million x.
	</p><p>
	  Subsampling should therefore be used for single genome de-novo
	  assemblies; or for EST/RNASeq assemblies which need reliable
	  coverage numbers for transcript expression data but where at least
	  all rDNA has been filtered out prior to assembly.
	</p></li><li class="listitem"><p>
	  <span class="emphasis"><em>Digital normalisation</em></span> is a process to perform a
	  reduction of sequencing data redundancy. It was made known to a
	  wider audience by the paper <span class="emphasis"><em>"A Reference-Free Algorithm
	  for Computational Normalization of Shotgun Sequencing
	  Data"</em></span> by Brown et al. (see
	  <a class="ulink" href="http://arxiv.org/abs/1203.4802" target="_top">http://arxiv.org/abs/1203.4802</a>).
	</p><p>
	  The normalisation process works by progressively going through the
	  sequencing data and selecting reads which bring new, previously
	  unseen information to the assembly and discarding those which
	  describe nothing new. For single genome assemblies, this has the
	  effect that repeats with n copies in the genome are afterwards
	  present often with just enough reads to reconstruct only a single
	  copy of the repeat. In EST/RNASeq assemblies, this leads to
	  reconstructed transcripts all having more or less the same coverage.
	</p><p>
	  The normalisation process as described in the paper allows for a
	  certain lossiness during the data reduction as it was developed to
	  cope with billions of reads. E.g., it will often lose borders in
	  genome reorganisation events or SNP information from ploidies, from
	  closely related gene copies or from closely related species.
	</p><p>
	  MIRA implements a variant of the algorithm: the <span class="emphasis"><em>lossless
	  digital normalisation</em></span>. Here, normalised data has copy
	  numbers reduced like in the original algorithm, but all variants
	  (SNPs, borders of reorganisation events etc.) present in the
	  original data set are retained in the reduced data set. Furthermore,
	  the normalisation is parameterised to take place only for
	  excessively repetitive parts of a data set which would lead to
	  overly increased run-time and memory consumption. This gives the
	  assembler the opportunity to correctly evaluate and work with
	  repeats which do not occur "too often" in a data set while still
	  being able to reconstruct at least one copy of the really nasty
	  repeats.
	</p><p>
	  Digital normalisation should not be done prior to an assembly with
	  MIRA, rather the MIRA parameter to perform a digital normalisation
	  on the complete data set should be used.
	</p><p>
	  The lossless digital normalisation of MIRA should be used for
	  EST/RNASeq assemblies containing highly repetitive data. Metagenome
	  assemblies may also profit from this feature.
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	    MIRA keeps track of the approximate coverage represented by the
	    reads chosen in the digital normalisation process. That is, MIRA is
	    able to give approximate coverage numbers as if digital
	    normalisation had never happened. The approximation may be around 10
	    to 20% below the true value. Contigs affected by this coverage
	    approximation are denoted with an additional "_dn" in their name.
	  </p><p>
	    Due to the digital
	    normalisation step, the coverage numbers in the info file
	    regarding contig statistics will not represent the number of
	    reads in the contig, but they will show an approximation of
	    the true coverage or expression value as if there had not been
	    a digital normalisation step performed.
	</p></td></tr></table></div></li></ul></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_caveats"></a>3.11. 
      Caveats
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_using_artificial_reads"></a>3.11.1. 
	Using data not from sequencing instruments: artificial / synthetic reads
      </h3></div></div></div><p>
	The default parameters for MIRA assemblies work best when given real
	sequencing data and they even expect the data to behave like real
	sequencing data. But some assembly strategies work in multiple rounds,
	using so called "artificial" or "synthetic" reads in later rounds,
	i.e., data which was not generated through sequencing machines but
	might be something like the consensus of previous assemblies.
      </p><p>
	If one doesn't take utmost care to make these artificial reads at least
	behave a little bit like real sequencing data, a number of quality
	assurance algorithms of MIRA might spot that they "look funny" and
	trim back these artificial reads ... sometimes even removing them
	completely.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note: Summary tips for creating artificial reads for MIRA assemblies"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Summary tips for creating artificial reads for MIRA assemblies</th></tr><tr><td align="left" valign="top"><p>
	  The following should lead to the least amount of surprises for most
	  assembly use cases when calling MIRA only with the most basic
	  switches <code class="literal">--project=... --job=...</code>
	</p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><span class="bold"><strong>Length:</strong></span> between 50 and 20000 bp
	  </li><li class="listitem"><span class="bold"><strong>Quality values:</strong></span> give your
	    artificial reads quality values. Using <span class="emphasis"><em>30</em></span>
	    as quality value for your bases should be OK for most
	    applications.
	  </li><li class="listitem"><span class="bold"><strong>Orientation:</strong></span> for every read you
	    create, create a read with the same data (bases and quality
	    values) in reverse complement direction.
	  </li></ol></div></td></tr></table></div><p>
	The following list gives all the gory details on what synthetic reads
	should look like or which MIRA algorithms to switch off in certain
	cases:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    Forward and reverse complement directions: most sequencing
	    technologies and strategies yield a mixture of reads with both
	    forward and reverse complement direction to the DNA sequenced. In
	    fact, having both directions allows for a much better quality
	    control of an alignment as sequencing technology dependent
	    sequencing errors will often affect only one direction at a given
	    place and not both (the exception being homopolymers and 454).
	  </p><p>
	    The MIRA <span class="emphasis"><em>proposed end clipping</em></span> algorithm
	     [-CL:pec] uses this knowledge to initially trim back
	    ends of reads to an area without sequencing errors. However, if
	    reads covering a given area of DNA are present in only one
	    direction, then these reads will be completely eliminated.
	  </p><p>
	    If you use only artificial reads in an assembly, then switch off
	    the <span class="emphasis"><em>proposed end clipping</em></span>
	     [-CL:pec=no].
	  </p><p>
	    If you mix artificial reads with "normal" reads, make sure that
	    every part of an artificial read is covered by some other read in
	    reverse complement direction (be it a normal or artificial
	    read). The easiest way to do that is to add a reverse complement
	    for every artificial read yourself, though if you use an
	    overlapping strategy with artificial reads, you can calculate the
	    overlaps and reverse complements of reads so that every second
	    artificial read is in reverse complement to save time and memory
	    afterwards during the computation.
	  </p></li><li class="listitem"><p>
	    Sequencing type/technology: MIRA currently knows Sanger, 454, Ion
	    Torrent, Solexa, PacBioHQ/LQ and "Text" as sequencing
	    technologies, every read entered in an assembly must be one of
	    those.
	  </p><p>
	    Artificial reads should be classified depending on the data they
	    were created from, that is, Sanger for consensus of Sanger reads,
	    454 for consensus of 454 reads etc. However, should reads created
	    from Illumina consensus be much longer than, say, 200 or 300
	    bases, you should treat them as Sanger reads.
	  </p></li><li class="listitem"><p>
	    Quality values: be careful to assign decent quality values to your
	    artificial reads as several quality clipping or consensus calling
	    algorithms make extensive use of qualities. Pay attention to
	    values of [-CL:qc:bsqc] as well as to
	     [-CO:mrpg:mnq:mgqrt].
	  </p></li><li class="listitem"><p>
	    Read lengths: the current maximum read length for MIRA is around
	    30kb. However, to allow for a safety margin, MIRA currently allows
	    only 20kb reads as maximum length.
	  </p></li></ul></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_ploidy_and_repeats"></a>3.11.2. 
	Ploidy and repeats
      </h3></div></div></div><p>
	MIRA treats ploidy differences as repeats and will therefore build
	separate contigs for the reads of a ploidy that has a difference to
	the other ploidy/ploidies.
      </p><p>
	There is simply no other way to handle ploidy while retaining the
	ability to separate repeats based on differences of only a single
	base. Everything else would be guesswork. I thought for some time
	about doing a coverage analysis around the potential repeat/ploidy
	site, but came to the conclusion that due to the stochastic nature of
	sequencing data, this would very probably take wrong decisions in too
	many cases to be acceptable.
      </p><p>
	If someone has a good idea, I'll be happy to hear it.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_handling_of_repeats"></a>3.11.3. 
	Handling of repeats
      </h3></div></div></div><p>
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_uniform_read_distribution"></a>3.11.3.1. 
	  Uniform read distribution
	</h4></div></div></div><p>
	  Under the assumption that reads in a project are uniformly
	  distributed across the genome, MIRA will enforce an average coverage
	  and temporarily reject reads from a contig when this average
	  coverage multiplied by a safety factor is reached at a given
	  site. This strategy reduces over-compression of repeats during the
	  contig building phase and keeps reads in reserve for other copies of
	  that repeat.
	</p><p>
	  It's generally a very useful tool to disentangle repeats, but has some
	  slight secondary effects: rejection of otherwise perfectly good
	  reads. The assumption of read distribution uniformity is the big
	  problem we have here: of course it's not really valid. You sometimes
	  have less, and sometimes more than "the average"
	  coverage. Furthermore, the new sequencing technologies - 454 perhaps
	  but certainly the ones from Solexa - show that you also have a skew
	  towards the site of replication origin.
	</p><p>
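	  Warning: Solexa data from late 2009 and 2010 show a high GC content
	  bias. This bias can reach 200 or 300%, i.e., the coverage of sequence
	  parts with low GC content can differ from the coverage of parts with
	  high GC content by a factor of two to three.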
	</p><p>
	  One example: let's assume the average coverage of a project is 8 and
	  by chance at one place there are 17 (non-repetitive) reads, then the
	  following happens:
	</p><p>
	  (Note: <span class="emphasis"><em>p</em></span> is the parameter  [-AS:urdsip])
	</p><p>
	  Pass 1 to <span class="emphasis"><em>p-1</em></span>: MIRA happily assembles everything together and calculates a
	  number of different things, amongst them an average coverage of ~8. At the
	  end of pass <span class="emphasis"><em>p-1</em></span>, it will announce this average coverage as first estimate
	  to the assembly process.
	</p><p>
	  Pass <span class="emphasis"><em>p</em></span>: MIRA has still assembled everything together, but at the end of each
	  pass the contig self-checking algorithms now include an "average coverage
	  check". They'll invariably find the 17 reads stacked and decide (looking at
	  the  [-AS:ardct] parameter which is assumed to be 2 for this example)
	  that 17 is larger than 2*8 and that this very well may be a repeat. The reads
	  get flagged as possible repeats.
	</p><p>
	  Pass <span class="emphasis"><em>p+1</em></span> to end: the "possibly repetitive" reads get a much tougher
	  treatment in MIRA. Amongst other things, when building the contig, the contig
	  now checks that "possibly repetitive" reads do not over-stack beyond the average
	  coverage multiplied by a safety value ( [-AS:urdcm]) which we'll
	  assume now to be 1.5 in this example. So, at a certain point, say when read 14
	  or 15 of that possible repeat want to be aligned to the contig at this given
	  place, the contig will just flatly refuse and tell the assembler to please
	  find another place for them, be it in this contig that is built or any other
	  that will follow. Of course, if the assembler cannot comply, the reads 14 to
	  17 will end up as contiglet (contig debris, if you want) or if it was only one
	  read that got rejected like this, it will end up as singlet or in the debris
	  file.
	</p><p>
	  Tough luck. I do have ideas on how to re-integrate those reads at the end of an
	  assembly, but I have deferred doing this as in every case I have looked at,
	  adding those reads to the contigs wouldn't have changed anything ... there's
	  already enough coverage.
	</p><p>
	  What should be done in those cases is simply to filter away the contiglets
	  (defined as being of small size and having an average coverage below the
	  average coverage of the project divided by 3 (or 2.5)) from a project.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_keeping_'long'_repetitive_contigs_separate"></a>3.11.3.2. 
	  Keeping 'long' repetitive contigs separate
	</h4></div></div></div><p>
	  MIRA has had a feature to keep long repeats in separate contigs
	  since version 2.9.36. Due to algorithm changes, this feature is now standard. The
	  effect of this is that contigs with non-repetitive sequence will
	  stop at a 'long repeat' border which cannot be crossed by a single
	  read or by paired reads, including only the first few bases of the
	  repeat. Long repeats will be kept as separate contigs.
	</p><p>
	  This has been implemented to get a clean overview on which parts of
	  an assembly are 'safe' and which parts will be 'difficult'. For
	  this, the naming of the contigs has been extended: contigs named
	  with a '_c' at the end are contigs which contain mostly 'normal'
	  coverage. Contigs with "rep_c" are contigs which contain mostly
	  sequence classified as repetitive and which could not be assembled
	  together with a 'c' contig.
	</p><p>
	  The question remains: what are 'long' repeats? MIRA defines these as
	  repeats that are not spanned by any read that has non-repetitive
	  parts at the end. Basically - for shotgun assemblies - the mean
	  length of the reads that go into the assembly defines the minimum
	  length of 'long' repeats that have to be kept in separate contigs.
	</p><p>
	  It has to be noted that when using paired-end (or template)
	  sequencing, 'long' repeats which can be spanned by read-pairs (or
	  templates) are frequently integrated into 'normal' contigs as MIRA
	  can correctly place them most of the time.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_ref_helping_finishing_by_tagging_reads_with_haf_tags"></a>3.11.3.3. 
	  Helping finishing by tagging reads with HAF tags
	</h4></div></div></div><p>
	  HAF tags (HAsh Frequency) are set by MIRA when the option to colour reads by
	  kmer frequency ([-GE:crkf], on by default in most --job combinations)
	  is on. These tags show the status of k-mers (stretch of bases of given length
	  <span class="emphasis"><em>k</em></span>) in read sequences: whether MIRA recognised them as being present in
	  sub-average, average, above average or repetitive numbers.
	</p><p>
	  When using a finishing program which can display tags in reads (and using the
	  proposed tag colour schemes for gap4 or consed), the assembly
	  will light up in colours ranging from light green to dark red, indicating
	  whether a certain part of the assembly is deemed non-repetitive to extremely
	  repetitive.
	</p><p>
	  One of the biggest advantages of the HAF tags is the implicit information they
	  convey on why the assembler stopped building a contig at an end.
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      if the read parts composing a contig end are mostly covered with HAF2
	      tags (below average frequency, coloured light-green), then one very probably
	      has a hole in the contig due to coverage problems which means there are no
	      or not enough reads covering a part of the sequence.
	    </p></li><li class="listitem"><p>
	      if the read parts composing a contig end are mostly covered with HAF3
	      tags (average frequency, coloured green), then you have an unusual situation
	      as this should only very rarely occur. The reason is that MIRA saw that
	      there are enough sequences which look the same as the one from your contig
	      end, but that these could not be joined. Likely reasons for this scenario
	      include non-random sequencing artifacts (seen in 454 data) or also
	      non-random chimeric reads (seen in Sanger and 454 data).
	    </p></li><li class="listitem"><p>
	      if the read parts composing a contig end are mostly covered with HAF4
	      tags (above average frequency, coloured yellow), then the assembler stopped
	      at a grey zone of the coverage not being normal anymore, but not quite
	      repetitive yet. This can happen in cases where the read coverage is very
	      unevenly distributed across the project. The contig end in question might be
	      a repeat occurring two times in the sequence, but having fewer reads than
	      expected. Or it may be non-repetitive coverage with an unusual excess of
	      reads.
	    </p></li><li class="listitem"><p>
	      if the read parts composing a contig end are mostly covered with HAF5
	      (repeat, coloured red), HAF6 (heavy repeat, coloured darker red) and HAF7
	      tags (crazy repeat, coloured very dark red), then there is a repetitive area
	      in the sequence which could not be uniquely bridged by the reads present in
	      the assembly.
	    </p></li></ul></div><p>
	</p><p>
	  This information can be especially helpful when joining reads by hand in a
	  finishing program. The following list gives you a short guide to cases which
	  are most likely to occur and what you should do.
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      the proposed join involves contig ends mostly covered by HAF2
	      tags. Joining these contigs is probably a safe bet. The assembly may have
	      missed this join because of too many errors in the read ends or because
	      sequence which could have been useful to join the contigs was clipped away.
	      Just check whether the join seems sensible, then join.
	    </p></li><li class="listitem"><p>
	      the proposed join involves contig ends mostly covered by HAF3
	      tags. Joining these contigs is probably a safe bet. The assembly may have
	      missed this join because of several similar chimeric reads or reads
	      with similar, severe sequencing errors covering the same spot.
	      Just check whether the join seems sensible, then join.
	    </p></li><li class="listitem"><p>
	      the proposed join involves contig ends mostly covered by HAF4
	      tags. Joining these contigs should be done with some caution, it
	      may be a repeat occurring twice in the sequence.  Check whether
	      the contig ends in question align with ends of several other
	      contigs. If not, joining is probably the way to go. If potential
	      joins exist with several other contigs, then it's a repeat (see
	      below).
	    </p></li><li class="listitem"><p>
	      the proposed join involves contig ends mostly covered by HAF5, HAF6 or
	      HAF7 tags. Joining these contigs should be done with utmost caution, you are
	      almost certainly (HAF5) and very certainly (HAF6 and HAF7) in a repetitive
	      area of your sequence.
	      You will probably need additional information like paired-end or template
	      info in order to join your contigs.
	    </p></li></ul></div><p>
	</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_consensus_in_finishing_programs_gap4_consed_"></a>3.11.4. 
	Consensus in finishing programs (gap4, consed, ...)
      </h3></div></div></div><p>
	MIRA goes a long way to calculate a consensus which is as correct as
	possible. Unfortunately, communication with finishing programs is a bit
	problematic as there currently is no standard way to say which reads are from
	which sequencing technology.
      </p><p>
	It is therefore often the case that finishing programs calculate their own
	consensus when loading a project assembled with MIRA. This is the case for,
	e.g., gap4. This consensus may then not be optimal.
      </p><p>
	The recommended way to deal with this problem is: import the results from MIRA
	into your finishing program like you always do. Then finish the genome there,
	export the project from the finishing program as CAF and finally use
	miraconvert (from the MIRA package) with the "-r" option to
	recalculate the optimal consensus of your finished project.
      </p><p>
	E.g., assuming you have just finished editing the gap4 database
	<code class="filename">DEMO.3</code>, do the following. First, export the gap4 database back to
	CAF:
      </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>gap2caf -project DEMO -version 3 &gt;demo3.caf</code></strong></pre><p>
      </p><p>
	Then, use <span class="command"><strong>miraconvert</strong></span> <span class="emphasis"><em>with</em></span> <span class="emphasis"><em>option</em></span> <span class="emphasis"><em>'-r'</em></span> to
	convert it into any other format that you need. Example for converting to a
	CAF and a FASTA format with correct consensus:
      </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>miraconvert -t caf -t fasta -r c demo3.caf final_result</code></strong></pre><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_some_other_things_to_consider"></a>3.11.5. 
	Some other things to consider
      </h3></div></div></div><p>
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    MIRA cannot work with EXP files resulting from GAP4 that already
	    have been edited. If you want to reassemble an edited GAP4 project, convert
	    it to CAF format and use the [-caf] option to load.
	  </p></li><li class="listitem"><p>
	    As also explained earlier, MIRA relies on sequencing vector being
	    recognised in preprocessing steps by other programs. Sometimes, when a whole
	    stretch of bases is not correctly marked as sequencing vector, the reads
	    might not be aligned into a contig although they might otherwise match quite
	    perfectly. You can use [-CL:pvc] and  [-CO:emea] to address
	    problems with incomplete clipping of sequencing vectors. Having the
	    assembler work with less strict parameters may also help in this case.
	  </p></li><li class="listitem"><p>
	    MIRA has been developed to assemble shotgun sequencing or EST
	    sequencing data. There are no explicit limitations concerning length or
	    number of sequences. However, there are a few implicit assumptions that were
	    made while writing portions of the code:
	  </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
		Problems which might arise with 'unnatural' long sequence
		reads: my implementation of the Smith-Waterman alignment
		routines. I use a banded version with linear running time
		(linear to the bandwidth) but quadratic space usage. So,
		comparing two 'reads' of length 5000 will result in memory
		usage of 95 MiB, two reads with 50000 bases will need 9.5 GiB.
	      </p><p>
		This problem has become acute now with PacBio, I'm working on
		it. In the meantime, currently usable sequence lengths of PacBio
		are more in the 3 to 4 kilobase range, with only a few reads
		attaining or surpassing 20 kb. So today's machines should
		still be able to handle the problem more or less effortlessly.
	      </p></li><li class="listitem"><p>
		32 bit versions of MIRA are not supported anymore.
	      </p></li><li class="listitem"><p>
		to reduce memory overhead, a number of assumptions have been made (they are listed after this numbered list).
	      </p></li><li class="listitem"><p>
		MIRA is not fully multi-threaded (yet), though most
		bottlenecks are now in code areas which cannot be
		multi-threaded by algorithm design.
	      </p></li></ol></div></li><li class="listitem"><p>
	    a project does not contain sequences from more than 255 different:
	  </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: circle; "><li class="listitem"><p>
		sequencing machine types
	      </p></li><li class="listitem"><p>
		primers
	      </p></li><li class="listitem"><p>
		strains (in mapping mode: 7)
	      </p></li><li class="listitem"><p>
		base callers
	      </p></li><li class="listitem"><p>
		dyes
	      </p></li><li class="listitem"><p>
		process status
	      </p></li></ul></div></li><li class="listitem"><p>
	    a project does not contain sequences from more than 65535 different:
	  </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: circle; "><li class="listitem"><p>
		clone vectors
	      </p></li><li class="listitem"><p>
		sequencing vectors
	      </p></li></ul></div></li></ul></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_things_you_should_not_do"></a>3.12. 
      Things you should not do
    </h2></div></div></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_never_on_nfs"></a>3.12.1. 
	Do not run MIRA on NFS mounted directories without redirecting the tmp directory
      </h3></div></div></div><p>
	Of course one can run MIRA atop a NFS mount (a "disk" mounted over a
	network using the NFS protocol), but the performance will go down the
	drain as the NFS server or the network will not be able to
	cope with the amount of data MIRA needs to shift to and from disk
	(writes/reads to the tmp directory). Slowdowns of a factor of 10 and
	more have been observed. In case you have no other possibility, you
	can force MIRA to run atop a NFS using [-NW:cnfs=warn]
	( [-NW:cnfs=no]), but you have been warned.
      </p><p>
	In case you want to keep input and output files on NFS, you can use
	[-DI:trt] to redirect the tmp directory to a local
	filesystem. Then MIRA will run at almost full speed.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_never_without_quality_values"></a>3.12.2. 
	Do not assemble without quality values
      </h3></div></div></div><p>
	Assembling sequences without quality values is like ... like ... like
	driving a car downhill a sinuous mountain road with no rails at 200
	km/h without brakes, airbags and no steering wheel. With a ravine on
	one side and a rock face on the other. Did I mention the missing
	seat-belts? You <span class="emphasis"><em>might</em></span> get down safely, but
	experience tells the result will rather be a bloody mess.
      </p><p>
	Well, assembling without quality values is a bit like above, but
	bloodier. And the worst: you (or the people using the results of such
	an assembly) will notice the gore only when it is way too late and
	money has been sunk in follow-up experiments based on wrong data.
      </p><p>
	All MIRA routines internally are geared toward quality values guiding
	decisions. No one should ever assemble anything without quality
	values. Never. Ever. Even if quality values are sometimes inaccurate,
	they do help.
      </p><p>
	Now, there are <span class="bold"><strong>very rare occasions</strong></span>
	where getting quality values is not possible. If you absolutely cannot
	get them, and I mean only in this case, use the following
	switch:<code class="literal">--noqualities[=SEQUENCINGTECHNOLOGY]</code> and
	additionally give a default quality for reads of a readgroup. E.g.:
      </p><pre class="screen">parameters= --noqualities=454

readgroup
technology=454
data=...
default_qual=30</pre><p>
	This tells MIRA not to complain about missing quality values and to
	fake a quality value of 30 for all reads (of a readgroup) having no
	qualities, allowing some MIRA routines (in standard parameter
	settings) to start disentangling your repeats.
      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	Doing the above has some severe side-effects. You will be, e.g., at
	the mercy of non-random sequencing errors. I suggest combining the
	above with a [-CO:mrpg=4] or higher. You also may want to
	tune the default quality parameter together with  [-CO:mnq]
	and  [-CO:mgqrt] in cases where you mix sequences with and
	without quality values.
      </td></tr></table></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_useful_third_party_programs"></a>3.13. 
      Useful third party programs
    </h2></div></div></div><p>
      Viewing the results of a MIRA assembly or preprocessing the sequences
      for an assembly can be done with a number of different programs. The
      following ones are just examples; there are a lot more packages
      available:
    </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	  HTML browser
	</span></dt><dd><p> If you really have nothing else as a viewer, a browser that
	  understands tables is needed to view the HTML output. A browser knowing
	  style sheets (CSS) is recommended, as different tags will be highlighted.
	  Konqueror, Opera, Mozilla, Netscape and Internet Explorer all do fine, lynx
	  is not really ...  optimal.
	  </p></dd><dt><span class="term">
	  Assembly viewer / finishing / preprocessing
	</span></dt><dd><p>
	    You'll want GAP4 or its successor GAP5 (generally speaking: the
	    Staden package) to preprocess the sequences, visualise and
	    eventually rework the results when using gap4da output. The Staden
	    package comes with a fully featured sequence preparing and
	    annotating engine (pregap4) that is very useful to preprocess your
	    Sanger data (conversion between file types, quality clipping,
	    tagging etc.).
	  </p><p>
	    See <a class="ulink" href="http://www.sourceforge.net/projects/staden/" target="_top">http://www.sourceforge.net/projects/staden/</a> for
	    further information and also a possibility to download precompiled
	    binaries for different platforms.
	  </p></dd><dt><span class="term">
	  Vector screening
	</span></dt><dd><p>
	    Reading result files from <span class="command"><strong>ssaha2</strong></span> or
	    <span class="command"><strong>smalt</strong></span> from the Sanger Centre is supported
	    directly by MIRA to perform a fast and efficient tagging of
	    sequencing vector stretches. This makes you basically independent
	    from any other commercial or license-requiring vector screening
	    software.  For Sanger reads, a combination of
	    <span class="command"><strong>lucy</strong></span> (see below), <span class="command"><strong>ssaha2</strong></span> or
	    <span class="command"><strong>smalt</strong></span> together with the MIRA parameters for
	    SSAHA2 / SMALT support (see all  [-CL:msvs*] parameters) and quality clipping
	    ( [-CL:qc]) should do the trick. For reads coming from 454
	    pyro-sequencing, <span class="command"><strong>ssaha2</strong></span> or
	    <span class="command"><strong>smalt</strong></span> and the SSAHA2 / SMALT support also work
	    pretty well.
	  </p><p>
	    See
	    <a class="ulink" href="http://www.sanger.ac.uk/resources/software/ssaha2/" target="_top">http://www.sanger.ac.uk/resources/software/ssaha2/</a>
	    and / or <a class="ulink" href="http://www.sanger.ac.uk/resources/software/smalt/" target="_top">http://www.sanger.ac.uk/resources/software/smalt/</a> for
	    further information and also a possibility to download the source
	    or precompiled binaries for different platforms.
	  </p></dd><dt><span class="term">
	  Preprocessing
	</span></dt><dd><p> <span class="command"><strong>lucy</strong></span> from TIGR (now JCVI) is another
	  useful sequence preprocessing program for Sanger data. Lucy is a
	  utility that prepares raw DNA sequence fragments for sequence
	  assembly. The cleanup process includes quality assessment,
	  confidence reassurance, vector trimming and vector removal.
	  </p><p>
	    There's a small script in the MIRA 3rd party package which
	    converts the clipping data from the lucy format into something
	    MIRA can understand (NCBI Traceinfo).
	  </p><p>
	    See <a class="ulink" href="ftp://ftp.tigr.org/pub/software/Lucy/" target="_top">ftp://ftp.tigr.org/pub/software/Lucy/</a> to download the source code
	    of lucy.
	  </p></dd><dt><span class="term">
	  Assembly viewer
	</span></dt><dd><p> Viewing <code class="filename">.ace</code> file output without consed
	  can be done with clview from TIGR. See
	  <a class="ulink" href="http://www.tigr.org/tdb/tgi/software/" target="_top">http://www.tigr.org/tdb/tgi/software/</a>.
	  </p><p>
	    A better alternative is Tablet <a class="ulink" href="http://bioinf.scri.ac.uk/tablet/" target="_top">http://bioinf.scri.ac.uk/tablet/</a> which also reads SAM
	    format.
	  </p></dd><dt><span class="term">
	  Assembly coverage analysis
	</span></dt><dd><p>
	    The Integrated Genome Browser (IGB) of the GenoViz project at
	    SourceForge (<a class="ulink" href="http://sourceforge.net/projects/genoviz/" target="_top">http://sourceforge.net/projects/genoviz/</a>) is just perfect
	    for loading a genome and looking at mapping coverage (provided by
	    the wiggle result files of MIRA).
	  </p></dd><dt><span class="term">
	  Preprocessing (base calling)
	</span></dt><dd><p>
	    TraceTuner (<a class="ulink" href="http://sourceforge.net/projects/tracetuner/" target="_top">http://sourceforge.net/projects/tracetuner/</a>) is a tool for
	    base and quality calling of trace files from DNA sequencing
	    instruments. Originally developed by Paracel, this code base was
	    released as open source in 2006 by Celera.
	  </p></dd><dt><span class="term">
	  Preprocessing / viewing
	</span></dt><dd><p> phred (basecaller) - cross_match (sequence comparison and
	  filtering) - phrap (assembler) - consed (assembly viewer and
	  editor). This is another package that can be used for this type of
	  job, but requires more programming work. The fact that sequence
	  stretches are masked out (overwritten with the character X) if they
	  shouldn't be used in an assembly doesn't really help and is
	  considered harmful (but it works).
	  </p><p>
	    Note the bug of consed when reading ACE files, see more about this
	    in the section on file types (above) in the entry for ACE.
	  </p><p>
	    See <a class="ulink" href="http://www.phrap.org/" target="_top">http://www.phrap.org/</a> for further information.
	  </p></dd><dt><span class="term">
	  text viewer
	</span></dt><dd><p> A text viewer for the different textual output files.
	  </p></dd></dl></div><p>
      As always, most of the time a combination of several different packages
      is possible. My currently preferred combo for genome projects is
      <span class="command"><strong>ssaha2</strong></span> or <span class="command"><strong>smalt</strong></span> and/or
      <span class="command"><strong>lucy</strong></span> (vector screening), MIRA (assembly, of course)
      and gap4 (assembly viewing and finishing).
    </p><p>
      For re-assembling projects that were edited in gap4, one will also need
      the gap2caf converter. The source for this is available at
      <a class="ulink" href="http://www.sanger.ac.uk/resources/software/caf.html" target="_top">http://www.sanger.ac.uk/resources/software/caf.html</a>.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_speed_and_memory_considerations"></a>3.14. 
      Speed and memory considerations
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_memory"></a>3.14.1. 
	Estimating needed memory for an assembly project
      </h3></div></div></div><p>
	Since the V2.9.24x3 version of MIRA, there is <span class="command"><strong>miramem</strong></span> as
	program call. When called from the command line, it will ask a number of
	questions and then print out an estimate of the amount of RAM needed to
	assemble the project. Take this estimate with a grain of salt: depending on
	the sequence properties, variations in the estimate can be +/- 30% for
	bacteria and 'simple' eukaryotes. The higher the number of repeats is, the
	more likely you will need to restrict memory usage in some way or another.
      </p><p>
	Here's the transcript of a session with miramem:
      </p><pre class="screen">
This is MIRA V3.2.0rc1 (development version).

Please cite: Chevreux, B., Wetter, T. and Suhai, S. (1999), Genome Sequence
Assembly Using Trace Signals and Additional Sequence Information.
Computer Science and Biology: Proceedings of the German Conference on
Bioinformatics (GCB) 99, pp. 45-56.

To (un-)subscribe the MIRA mailing lists, see:
        http://www.chevreux.org/mira_mailinglists.html

After subscribing, mail general questions to the MIRA talk mailing list:
        mira_talk@freelists.org

To report bugs or ask for features, please use the SourceForge ticketing
system at:
        http://sourceforge.net/p/mira-assembler/tickets/
This ensures that requests do not get lost.

[...]

miraMEM helps you to estimate the memory needed to assemble a project.
Please answer the questions below.

Defaults are give in square brackets and chosen if you just press return.
Hint: you can add k/m/g modifiers to your numbers to say kilo, mega or giga.

Is it a genome or transcript (EST/tag/etc.) project? (g/e/) [g]
g
Size of genome? [4.5m] <strong class="userinput"><code>9.8m</code></strong>
9800000
Size of largest chromosome? [9800000]
9800000
Is it a denovo or mapping assembly? (d/m/) [d]
d
Number of Sanger reads? [0]
0
Are there 454 reads? (y/n/) [n] <strong class="userinput"><code>y</code></strong>
y
Number of 454 GS20 reads? [0]
0
Number of 454 FLX reads? [0]
0
Number of 454 Titanium reads? [0] <strong class="userinput"><code>750k</code></strong>
750000
Are there PacBio reads? (y/n/) [n]
n
Are there Solexa reads? (y/n/) [n]
n


************************* Estimates *************************

The contigs will have an average coverage of ~ 30.6 (+/- 10%)

RAM estimates:
           reads+contigs (unavoidable): 7.0 GiB
                large tables (tunable): 688. MiB
                                        ---------
                          total (peak): 7.7 GiB

            add if using -CL:pvlc=yes : 2.6 GiB

Estimates may be way off for pathological cases.

Note that some algorithms might try to grab more memory if
the need arises and the system has enough RAM. The options
for automatic memory management control this:
  -AS:amm, -AS:kpmf, -AS:mps
Further switches that might reduce RAM (at cost of run time
or accuracy):
  -SK:mkim, -SK:mchr (both runtime); -SK:mhpr (accuracy)
*************************************************************</pre><p>
	If your RAM is not large enough, you can still assemble projects by
	using disk swap. Up to 20% of the needed memory can be provided by
	swap without the speed penalty getting too large. Going above 20% is
	not recommended though, above 30% the machine will be almost
	permanently swapping at some point or another.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_ref_speed"></a>3.14.2. 
	Some numbers on speed
      </h3></div></div></div><p>
	To be rewritten for MIRA4.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_known_problems_bugs"></a>3.15. 
      Known Problems / Bugs
    </h2></div></div></div><p>
      File Input / Output:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  MIRA can only read unedited EXP files.
	</p></li><li class="listitem"><p>
	  There sometimes is a (rather important) memory leak occurring while
	  using the assembly integrated Sanger read editor. I have not been
	  able to trace the reason yet.
	</p></li></ol></div><p>
    </p><p>
      Assembly process:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  The routines for determining <span class="emphasis"><em>Repeat Marker
	  Bases</em></span> (SRMr) are sometimes too sensitive, which sometimes
	  leads to excessive base tagging and preventing right assemblies in
	  subsequent assembly processes. The parameters you should look at for
	  this problem are
	   [-CO:mrc:nrz:mgqrt:mgqwpc]. Also look at  [-CL:pvc] and
	   [-CO:emea] if you have a lot of sequencing vector relics at the
	  end of the sequences.
	</p></li></ol></div><p>
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_todos"></a>3.16. 
      TODOs
    </h2></div></div></div><p>
      These are some of the topics on my TODO list for the next revisions to
      come:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  Making Smith-Waterman parts of the process multi-threaded or use SIMD
	  (currently stopped due to other priorities like PacBio etc.)
	</p></li></ol></div><p>
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_working_principles"></a>3.17. 
      Working principles
    </h2></div></div></div><p>
      Note: description is old and needs to be adapted to the current 4.x line
      of MIRA.
    </p><p>
      To avoid the "garbage-in, garbage-out" problem, MIRA uses a 'high
      quality alignments first' contig building strategy. This means that the
      assembler will start with those regions of sequences that have been
      marked as good quality (high confidence region - HCR) with low error
      probabilities (the clipping must have been done by the base caller or
      other preprocessing programs, e.g. pregap4) and then gradually extends
      the alignments as errors in different reads are resolved through error
      hypothesis verification and signal analysis.
    </p><p>
      This assembly approach relies on some of the automatic editing
      functionality provided by the EdIt package which has been integrated in
      parts within MIRA.
    </p><p>
      This is an approximate overview on the steps that are executed while
      assembling:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  All the experiment / phd / fasta sequences that act as input are
	  loaded (or the CAF project). Qualities for the bases are loaded from
	  the FASTA or SCF if needed.
	</p></li><li class="listitem"><p>
	  the ends of the reads are cleaned to ensure they have a minimum stretch
	  of bases without sequencing errors
	</p></li><li class="listitem"><p>
	  The high confidence region (HCR) of each read is compared with a
	  quick algorithm to the HCR of every other read to see if it could
	  match and have overlapping parts (this is the 'SKIM' filter).
	</p></li><li class="listitem"><p>
	  All the reads which could match are being checked with an adapted
	  Smith-Waterman alignment algorithm (banded version). Obvious
	  mismatches are rejected, the accepted alignments form one or several
	  alignment graphs.
	</p></li><li class="listitem"><p>
	  Optional pre-assembly read extension step: MIRA tries to extend HCR
	  of reads by analysing the read pairs from the previous
	  alignment. This is a bit shaky as reads in this step have not been
	  edited yet, but it can help. Go back to step 2.
	</p></li><li class="listitem"><p>
	  A contig gets made by building a preliminary partial path through
	  the alignment graph (through in-depth analysis up to a given level)
	  and then adding the most probable overlap candidates to a given
	  contig. Contigs may reject reads if these introduce too many errors
	  in the existing consensus. Errors in regions known as dangerous
	  (for the time being only ALUS and REPT) get additional attention by
	  performing simple signal analysis when alignment discrepancies
	  occur.
	</p></li><li class="listitem"><p>
	  Optional: the contig can be analysed and corrected by the automatic
	  editor ("EdIt" for Sanger reads, or the new MIRA editor for 454
	  reads).
	</p></li><li class="listitem"><p>
	  Long repeats are searched for, bases in reads of different repeats
	  that have been assembled together but differ sufficiently (for EdIt
	  so that they didn't get edited and by phred quality value) get
	  tagged with special tags (SRMr and WRMr).
	</p></li><li class="listitem"><p>
	  Go back to step 5 if there are reads present that have not been
	  assembled into contigs.
	</p></li><li class="listitem"><p>
	  Optional: Detection of spoiler reads that prevent joining of
	  contigs. Remedy by shortening them.
	</p></li><li class="listitem"><p>
	  Optional: Write out a checkpoint assembly file and go back to step 2.
	</p></li><li class="listitem"><p>
	  The resulting project is written out to different output files and
	  directories.
	</p></li></ol></div><p>
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_ref_see_also"></a>3.18. 
      See Also
    </h2></div></div></div><p>
      The other MIRA manuals and walkthroughs as well as
      <span class="command"><strong>EdIt</strong></span>, <span class="command"><strong>gap4</strong></span>,
      <span class="command"><strong>pregap4</strong></span>, <span class="command"><strong>gap5</strong></span>,
      <span class="command"><strong>clview</strong></span>, <span class="command"><strong>caf2gap</strong></span>,
      <span class="command"><strong>gap2caf</strong></span>, <span class="command"><strong>ssaha2</strong></span>,
      <span class="command"><strong>smalt</strong></span>, <span class="command"><strong>compress</strong></span> and
      <span class="command"><strong>gzip</strong></span>, <span class="command"><strong>cap3</strong></span>,
      <span class="command"><strong>ttuner</strong></span>, <span class="command"><strong>phred</strong></span>,
      <span class="command"><strong>phrap</strong></span>, <span class="command"><strong>cross_match</strong></span>,
      <span class="command"><strong>consed</strong></span>.
    </p></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_dataprep"></a>Chapter 4. Preparing data</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_pd_introduction">4.1. 
      Introduction
    </a></span></dt><dt><span class="sect1"><a href="#sect_pd_sanger">4.2. 
      Sanger
    </a></span></dt><dt><span class="sect1"><a href="#sect_pd_454">4.3. 
      Roche / 454
    </a></span></dt><dt><span class="sect1"><a href="#sect_pd_illumina">4.4. 
      Illumina
    </a></span></dt><dt><span class="sect1"><a href="#sect_pd_pacbio">4.5. 
      Pacific Biosciences
    </a></span></dt><dt><span class="sect1"><a href="#sect_pd_iontor">4.6. 
      Ion Torrent
    </a></span></dt><dt><span class="sect1"><a href="#sect_pd_sra">4.7. 
      Short Read Archive (SRA)
    </a></span></dt></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">Rome didn't fall in a day either.</span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_pd_introduction"></a>4.1. 
      Introduction
    </h2></div></div></div><p>
      Most of this chapter and many sections are just stubs at the moment.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_pd_sanger"></a>4.2. 
      Sanger
    </h2></div></div></div><p>
      Outside MIRA: transform .ab1 to .scf, perform sequencing vector clip
      (and cloning vector clip if used), basic quality clips.
    </p><p>
      Recommended program: <span class="command"><strong>gap4</strong></span> (or
      rather <span class="command"><strong>pregap4</strong></span>) from the Staden 4 package.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_pd_454"></a>4.3. 
      Roche / 454
    </h2></div></div></div><p>
      Outside MIRA: convert the SFF files from the Roche instrument to FASTQ,
      use <span class="command"><strong>sff_extract</strong></span> for that. In case you used
      "non-standard" sequencing procedures: clip away MIDs, clip away
      non-standard sequencing adaptors used in that project.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_pd_illumina"></a>4.4. 
      Illumina
    </h2></div></div></div><p>
      Outside MIRA: for heaven's sake: do NOT try to clip or trim by quality
      yourself. Do NOT try to remove standard sequencing adaptors
      yourself. Just leave Illumina data alone! (really, I mean it).
    </p><p>
      MIRA is much, much better at that job than you will probably ever be
      ... and I dare to say that MIRA is better at that job than 99% of all
      clipping/trimming software existing out there. Just make sure you use
      the [-CL:pec] (proposed_end_clip) option of MIRA.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      The <span class="emphasis"><em>only</em></span> exception to the above is if you (or your
      sequencing provider) used decidedly non-standard sequencing
      adaptors. Then it might be worthwhile to perform own adaptor
      clipping. But this will not be the case for 99% of all sequencing
      projects out there.
    </td></tr></table></div><p>
      Joining paired-ends: if you want to do this, feel free to use any tool
      which is out there (TODO: quick list). Just make sure they do not join
      on very short overlaps. For me, the minimum overlap is at least 17
      bases, but I more commonly use at least 30.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_pd_pacbio"></a>4.5. 
      Pacific Biosciences
    </h2></div></div></div><p>
      Outside MIRA: MIRA needs error corrected reads, either
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem">
	PacBio CCS reads (circular consensus sequence) which you get from the
	PacBio SMRTAnalysis pipeline
      </li><li class="listitem">
	or self-corrected reads, or reads corrected with other sequencing
	technologies which you will get either from the PacBio HGAP pipeline
	or the pacbioToCA pipeline
      </li></ul></div><p>
      Assembly of uncorrected PacBio reads (CLR) is currently not supported
      officially as of MIRA 4.0.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_pd_iontor"></a>4.6. 
      Ion Torrent
    </h2></div></div></div><p>
      Outside MIRA: need to convert BAM to FASTQ. Need to clip away
      non-standard sequencing adaptors if used in that project. Apart from
      that: leave the data alone.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_pd_sra"></a>4.7. 
      Short Read Archive (SRA)
    </h2></div></div></div><p>
      Outside MIRA: you need to convert SRA format to FASTQ format. This is done
      using <span class="command"><strong>fastq-dump</strong></span> from the SRA toolkit from the
      NCBI. Make sure to have at least version 2.4.x of the toolkit. Last time
      I looked (March 2015), the software was at
      <a class="ulink" href="http://www.ncbi.nlm.nih.gov/Traces/sra/?view=software" target="_top">http://www.ncbi.nlm.nih.gov/Traces/sra/?view=software</a>, the
      documentation for the whole toolkit was at
      <a class="ulink" href="http://www.ncbi.nlm.nih.gov/Traces/sra/?view=toolkit_doc" target="_top">http://www.ncbi.nlm.nih.gov/Traces/sra/?view=toolkit_doc</a>,
      and for <span class="command"><strong>fastq-dump</strong></span> it was
      <a class="ulink" href="http://www.ncbi.nlm.nih.gov/Traces/sra/?view=toolkit_doc&amp;f=fastq-dump" target="_top">http://www.ncbi.nlm.nih.gov/Traces/sra/?view=toolkit_doc&amp;f=fastq-dump</a>
    </p><p>
      After extraction, proceed with preprocessing as described above,
      depending on the sequencing technology used.
    </p><p>
      For extracting Illumina data, use something like this:
    </p><pre class="screen"><code class="prompt">arcadia:/some/path$</code> <strong class="userinput"><code>fastq-dump -I --split-files <em class="replaceable"><code>somefile.sra</code></em></code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	As <span class="command"><strong>fastq-dump</strong></span> unfortunately uses a pretty wasteful
	variant of the FASTQ format, you might want to reduce the file size
	for each FASTQ it produces by doing this:
      </p><pre class="screen"><strong class="userinput"><code>sed -i '3~4 s/^+.*$/+/' <em class="replaceable"><code>file.fastq</code></em></code></strong></pre><p>
	The above command performs an in-file replacement of unnecessary names
	and comments on the quality divider lines of the FASTQ. The exact
	translation of the <span class="command"><strong>sed</strong></span> is: do an in-file
	replacement (-i); starting on the third line, then every fourth line
	(3~4); substitute (s/); a line which starts (^); with a plus (+); and
	then can have any character (.); repeated any number of times
	including zero (*); until the end of the line ($); by just a single
	plus character (/+/).
      </p><p>
	This alone reduces the file size of a typical Illumina data set with
	100mers extracted from the SRA by about 15 to 20%.
      </p></td></tr></table></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_denovo"></a>Chapter 5. De-novo assemblies</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_dn_introduction">5.1. 
      Introduction
    </a></span></dt><dt><span class="sect1"><a href="#sect_dn_general">5.2. 
      General steps
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_dn_ge_copying_and_naming_the_sequence_data">5.2.1. 
	Copying and naming the sequence data
      </a></span></dt><dt><span class="sect2"><a href="#sect_dn_ge_writing_a_simple_manifest_file">5.2.2. 
	Writing a simple manifest file
      </a></span></dt><dt><span class="sect2"><a href="#sect_dn_ge_starting_assembly">5.2.3. Starting the assembly</a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_dn_manifest_files_use_cases">5.3. 
      Manifest files for different use cases
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_dn_mf_denovo_with_shotgun_data">5.3.1. 
	Manifest for shotgun data
      </a></span></dt><dt><span class="sect2"><a href="#sect_dn_mf_assembling_with_multiple_technologies">5.3.2. 
	Assembling with multiple sequencing technologies (hybrid assemblies)
      </a></span></dt><dt><span class="sect2"><a href="#sect_dn_mf_manifest_for_pairedend_data">5.3.3. 
	Manifest for data sets with paired reads
      </a></span></dt><dt><span class="sect2"><a href="#sect_dn_mf_denovo_with_multiple_strains">5.3.4. 
	De-novo with multiple strains
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">The universe is full of surprises - most of them nasty.</span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_dn_introduction"></a>5.1. 
      Introduction
    </h2></div></div></div><p>
      This guide assumes that you have basic working knowledge of Unix systems, know
      the basic principles of sequencing (and sequence assembly) and what assemblers
      do.
    </p><p>
      While there are step by step instructions on how to set up your data and
      then perform an assembly, this guide expects you to read at some point in time
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	  Before the assembly, <a class="xref" href="#chap_dataprep" title="Chapter 4. Preparing data">Chapter 4: &#8220;<i>Preparing data</i>&#8221;</a> to know what to do (or not to
	  do) with the sequencing data before giving it to MIRA.
	</p></li><li class="listitem"><p>
	  For users with PacBio reads, <a class="xref" href="#sect_sp_pacbio_ccs" title="8.2.1.  PacBio CCS reads">Section 8.2.1: &#8220;
	PacBio CCS reads
      &#8221;</a> has important
	  information regarding special parameters needed.
	</p></li><li class="listitem"><p>
	  After the assembly, <a class="xref" href="#chap_results" title="Chapter 9. Working with the results of MIRA">Chapter 9: &#8220;<i>Working with the results of MIRA</i>&#8221;</a> to know what to do with the
	  results of the assembly. More specifically, <a class="xref" href="#sect_res_looking_at_results" title="9.1.  MIRA output directories and files">Section 9.1: &#8220;
      MIRA output directories and files
    &#8221;</a>, <a class="xref" href="#sect_res_first_look:the_assembly_info" title="9.2.  First look: the assembly info">Section 9.2: &#8220;
      First look: the assembly info
    &#8221;</a>, <a class="xref" href="#sect_res_converting_results" title="9.3.  Converting results">Section 9.3: &#8220;
      Converting results
    &#8221;</a>, <a class="xref" href="#sect_res_filtering_of_results" title="9.4.  Filtering results">Section 9.4: &#8220;
      Filtering results
    &#8221;</a> and <a class="xref" href="#sect_res_places_of_importance_in_a_de_novo_assembly" title="9.5.  Places of importance in a de-novo assembly">Section 9.5: &#8220;
      Places of importance in a de-novo assembly
    &#8221;</a>.
	</p></li><li class="listitem"><p>
	  And also <a class="xref" href="#chap_reference" title="Chapter 3. MIRA 4 reference manual">Chapter 3: &#8220;<i>MIRA 4 reference manual</i>&#8221;</a> to look up how manifest files should be
	  written (<a class="xref" href="#sect_ref_manifest_basics" title="3.4.2.  The manifest file: basics">Section 3.4.2: &#8220;
	The manifest file: basics
      &#8221;</a> and <a class="xref" href="#sect_ref_manifest_readgroups" title="3.4.3.  The manifest file: information on the data you have">Section 3.4.3: &#8220;
	The manifest file: information on the data you have
      &#8221;</a> and <a class="xref" href="#sect_ref_manifest_parameters" title="3.4.4.  The manifest file: extended parameters">Section 3.4.4: &#8220;
	The manifest file: extended parameters
      &#8221;</a>), some command line options as well as general information on
	  what tags MIRA uses in assemblies, files it generates etc.pp.
	</p></li><li class="listitem"><p>
	  Last but not least, you may be interested in some observations about
	  the different sequencing technologies and the traps they may
	  contain, see <a class="xref" href="#chap_seqtechdesc" title="Chapter 12. Description of sequencing technologies">Chapter 12: &#8220;<i>Description of sequencing technologies</i>&#8221;</a> for that. For advice on what to pay
	  attention to <span class="emphasis"><em>before</em></span> going into a sequencing
	  project, have a look at <a class="xref" href="#chap_seqadvice" title="Chapter 13. Some advice when going into a sequencing project">Chapter 13: &#8220;<i>Some advice when going into a sequencing project</i>&#8221;</a>.
	</p></li></ul></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_dn_general"></a>5.2. 
      General steps
    </h2></div></div></div><p>
      This part will show you step by step how to get your data together
      for a simple de-novo assembly. I'll make up an example using an
      imaginary bacterium: <span class="emphasis"><em>Bacillus chocorafoliensis</em></span> (or
      short: <span class="emphasis"><em>Bchoc</em></span>). You collected the strain you want to
      assemble somewhere in the wild, so you gave the strain the name
      <span class="emphasis"><em>Bchoc_wt</em></span>.
    </p><p>
      Just for laughs, let's assume you sequenced that bug with lots of more
      or less current sequencing technologies: Sanger, 454, Illumina, Ion
      Torrent and Pacific Biosciences.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_dn_ge_copying_and_naming_the_sequence_data"></a>5.2.1. 
	Copying and naming the sequence data
      </h3></div></div></div><p>
	You need to create (or get from your sequencing provider) the
	sequencing data in any supported file format. Amongst these, FASTQ and
	FASTA + FASTA-quality will be the most common, although the latter is
	well on the way out nowadays. The following walkthrough uses what most
	people nowadays get: FASTQ.
      </p><p>
	Create a new project directory (e.g. <code class="filename">myProject</code>)
	and a subdirectory of this which will hold the sequencing data
	(e.g. <code class="filename">data</code>).
      </p><pre class="screen"><code class="prompt">arcadia:/path/to$</code> <strong class="userinput"><code>mkdir myProject</code></strong>
<code class="prompt">arcadia:/path/to$</code> <strong class="userinput"><code>cd myProject</code></strong>
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>mkdir data</code></strong></pre><p>
	Put the FASTQ data into that <code class="filename">data</code> directory so
	that it now looks perhaps like this:
      </p><pre class="screen"><code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>ls -l data</code></strong>
-rw-r--r-- 1 bach users 263985896 2008-03-28 21:49 bchocwt_lane6.solexa.fastq</pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      I completely made up the file names above. You can name them any way you
      want. And you can have them live anywhere on the hard-disk, you do not
      need to put them in this <code class="filename">data</code> directory. It's just
      the way I do it ... and it's where the example manifest files a bit
      further down in this chapter will look for the data files.
    </td></tr></table></div><p>
      We're almost finished with the setup. As I like to have things neatly separated, I always create a directory called <code class="filename">assemblies</code> which will hold my assemblies (or different trials) together. Let's quickly do that:
    </p><pre class="screen"><code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>mkdir assemblies</code></strong>
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>mkdir assemblies/1sttrial</code></strong>
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>cd assemblies/1sttrial</code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_dn_ge_writing_a_simple_manifest_file"></a>5.2.2. 
	Writing a simple manifest file
      </h3></div></div></div><p>
	A manifest file is a configuration file for MIRA which tells it what
	type of assembly it should do and which data it should load. In this
	case we'll make a simple assembly of a genome with unpaired Illumina
	data:
      </p><pre class="screen"># Example for a manifest describing a genome de-novo assembly with
# unpaired Illumina data

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should assemble a genome de-novo in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,denovo,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# here comes the unpaired Illumina data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomeUnpairedIlluminaReadsIGotFromTheLab</code></em>
data = <em class="replaceable"><code>../../data/bchocwt_lane6.solexa.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em></code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  Please look up the parameters of the manifest file in the main
	  manual or the example manifest files in the following section.
	</p><p>
	  The ones above basically say: make an accurate denovo assembly of
	  unpaired Illumina reads.
	</p></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_dn_ge_starting_assembly"></a>5.2.3. Starting the assembly</h3></div></div></div><p>
	Starting the assembly is now just a matter of a simple command line:
      </p><pre class="screen"><code class="prompt">arcadia:/path/to/myProject/assemblies/1sttrial$</code> <strong class="userinput"><code>mira <em class="replaceable"><code>manifest.conf &gt;&amp;log_assembly.txt</code></em></code></strong></pre><p>
	For this example - if you followed the walk-through on how to prepare the data
	- the only thing you might want to adapt at first is the following option in
	the manifest file:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    project= (for naming your assembly project)
	  </p></li></ul></div><p>
	Of course, you are free to change any option via the extended parameters, but
	this is the topic of another part of this manual.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_dn_manifest_files_use_cases"></a>5.3. 
      Manifest files for different use cases
    </h2></div></div></div><p>
      This section will introduce you to manifest files for different use
      cases. It should cover the most important uses, but as always you are
      free to mix and match the parameters and readgroup definitions to suit
      your specific needs.
    </p><p>
      Taking into account that there may be <span class="emphasis"><em>a lot</em></span> of
      combinations of sequencing technologies, sequencing libraries (shotgun,
      paired-end, mate-pair, etc.) and input file types (FASTQ, FASTA,
      GenBank, GFF3, etc.pp), the example manifest files just use Illumina and
      454 as technologies, GFF3 as input file type for the reference sequence,
      FASTQ as input type for sequencing data ... and they do not show the
      multitude of more advanced features like, e.g., using ancillary clipping
      information in XML files, ancillary masking information in SSAHA2 or
      SMALT files etc.pp.
    </p><p>
      I'm sure you will be able to find your way by scanning through the
      corresponding section on manifest files in the reference chapter :-)
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_dn_mf_denovo_with_shotgun_data"></a>5.3.1. 
	Manifest for shotgun data
      </h3></div></div></div><p>
	Well, we've seen that already in the section above, but here it is
	again ... but this time with 454 data.
      </p><pre class="screen"># Example for a manifest describing a denovo assembly with
# unpaired 454 data

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should assemble a genome de-novo in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,denovo,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# here's the 454 data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomeUnpaired454ReadsIGotFromTheLab</code></em>
data = <em class="replaceable"><code>../../data/some454data.fastq</code></em>
technology = <em class="replaceable"><code>454</code></em></code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_dn_mf_assembling_with_multiple_technologies"></a>5.3.2. 
	Assembling with multiple sequencing technologies (hybrid assemblies)
      </h3></div></div></div><p>
	Hybrid de-novo assemblies follow the general manifest scheme: tell
	what you want in the first part, then simply add as separate readgroups
	the information MIRA needs to know to find the data and off you
	go. Just for laughs, here's a manifest for 454 shotgun with Illumina
	shotgun:
      </p><pre class="screen"># Example for a manifest describing a denovo assembly with
# shotgun 454 and shotgun Illumina data

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should assemble a genome de-novo in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,denovo,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# now the shotgun 454 data
<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForShotgun454</code></em>
data = <em class="replaceable"><code>../../data/project454data.fastq</code></em>
technology = <em class="replaceable"><code>454</code></em></code></strong>

# now the shotgun Illumina data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForShotgunIllumina</code></em>
data = <em class="replaceable"><code>../../data/someillumina.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em></code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_dn_mf_manifest_for_pairedend_data"></a>5.3.3. 
	Manifest for data sets with paired reads
      </h3></div></div></div><p>
	When using paired-end data, you should know
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    the orientation of the reads toward each other. This is specific
	    to sequencing technologies and / or the sequencing library preparation.
	  </p></li><li class="listitem"><p>
	    at which distance these reads should be. This is specific to the
	    sequencing library preparation and the sequencing lab should tell
	    you this.
	  </p></li></ol></div><p>
	In case you do not know one (or any) of the above, don't panic! MIRA
	is able to estimate the needed values during the assembly if you tell
	it to.
      </p><p>
	The following manifest shows you the laziest way to define a
	paired data set by simply adding <span class="emphasis"><em>autopairing</em></span> as keyword to a
	readgroup (using Illumina just as example):
      </p><pre class="screen"># Example for a lazy manifest describing a denovo assembly with
# one library of paired reads

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should assemble a genome de-novo in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,denovo,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# now the Illumina paired-end data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataIlluminaPairedLib</code></em>
<em class="replaceable"><code>autopairing</code></em>
data = <em class="replaceable"><code>../../data/project_1.fastq ../../data/project_2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em></code></strong></pre><p>
	If you know the orientation of the reads and/or the library size, you
	can tell MIRA this the following way (just showing the readgroup
	definition here):
      </p><pre class="screen"><strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataIlluminaPairedEnd500Lib</code></em>
data = <em class="replaceable"><code>../../data/project_1.fastq ../../data/project_2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
template_size = <em class="replaceable"><code>250 750</code></em>
segment_placement = <em class="replaceable"><code>---&gt; &lt;---</code></em></code></strong></pre><p>
	In case you are not 100% sure about, e.g., the size of the DNA
	template, you can also give a (generous) expected range and then tell
	MIRA to automatically refine this range during the assembly based on
	real, observed distances of read pairs. Do this with the <span class="emphasis"><em>autorefine</em></span>
	modifier like this:
      </p><pre class="screen"><strong class="userinput"><code>template_size = <em class="replaceable"><code>50 1000 autorefine</code></em></code></strong></pre><p>
	The following manifest file is an example for assembling with several
	different libraries from different technologies. Do not forget you
	can use <span class="emphasis"><em>autopairing</em></span> or <span class="emphasis"><em>autorefine</em></span> :-)
      </p><pre class="screen"># Example for a manifest describing a denovo assembly with
# several kinds of sequencing libraries

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should assemble a genome de-novo in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,denovo,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# now the Illumina paired-end data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataIlluminaForPairedEnd500bpLib</code></em>
data = <em class="replaceable"><code>../../data/project500bp-1.fastq ../../data/project500bp-2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em>
template_size = <em class="replaceable"><code>250 750</code></em>
segment_placement = <em class="replaceable"><code>---&gt; &lt;---</code></em></code></strong>

# now the Illumina mate-pair data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataIlluminaForMatePair3kbLib</code></em>
data = <em class="replaceable"><code>../../data/project3kb-1.fastq ../../data/project3kb-2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em>
template_size = <em class="replaceable"><code>2500 3500</code></em>
segment_placement = <em class="replaceable"><code>&lt;--- ---&gt;</code></em></code></strong>

# some Sanger data (6kb library)

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForSanger6kbLib</code></em>
data = <em class="replaceable"><code>../../data/sangerdata.fastq</code></em>
technology = <em class="replaceable"><code>sanger</code></em>
template_size = <em class="replaceable"><code>5500 6500</code></em>
segment_placement = <em class="replaceable"><code>---&gt; &lt;---</code></em></code></strong>

# some 454 data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataFor454Pairs</code></em>
data = <em class="replaceable"><code>../../data/454data.fastq</code></em>
technology = <em class="replaceable"><code>454</code></em>
template_size = <em class="replaceable"><code>8000 12000</code></em>
segment_placement = <em class="replaceable"><code>2---&gt; 1---&gt;</code></em></code></strong>

# some Ion Torrent data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForIonPairs</code></em>
data = <em class="replaceable"><code>../../data/iondata.fastq</code></em>
technology = <em class="replaceable"><code>iontor</code></em>
template_size = <em class="replaceable"><code>1000 300</code></em>
segment_placement = <em class="replaceable"><code>2---&gt; 1---&gt;</code></em></code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_dn_mf_denovo_with_multiple_strains"></a>5.3.4. 
	De-novo with multiple strains
      </h3></div></div></div><p>
	MIRA will make use of ancillary information present in the manifest
	file. One of these is the information to which strain (or organism or
	cell line etc.pp) the generated data belongs.
      </p><p>
	You just need to tell in the manifest file which data comes from which
	strain. Let's assume that in the example from above, the "lane6" data
	were from a first mutant named <span class="emphasis"><em>bchoc_se1</em></span> and the
	"lane7" data were from a second mutant
	named <span class="emphasis"><em>bchoc_se2</em></span>. Here's the manifest file you
	would write then:
      </p><pre class="screen"># Example for a manifest describing a de-novo assembly with
# unpaired Illumina data, but from multiple strains

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should assemble a genome de-novo in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,denovo,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# now the Illumina data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForSE1</code></em>
data = <em class="replaceable"><code>../../data/bchocse_lane6.solexa.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em></code></strong>

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForSE2</code></em>
data = <em class="replaceable"><code>../../data/bchocse_lane7.solexa.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se2</code></em></code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	While assembling de-novo (or mapping) with multiple strains is
	possible, the interpretation of results may become a bit daunting in
	some cases. For many scenarios it might therefore be preferable to
	successively use the data sets in their own assemblies or mappings.
      </td></tr></table></div><p>
	This <span class="emphasis"><em>strain</em></span> information for each readgroup is
	really the only change you need to perform to tell MIRA everything it
	needs for handling strains.
      </p></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_mapping"></a>Chapter 6. Mapping assemblies</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_map_introduction">6.1. 
      Introduction
    </a></span></dt><dt><span class="sect1"><a href="#sect_map_general">6.2. 
      General steps
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_map_ge_copying_and_naming_the_sequence_data">6.2.1. 
	Copying and naming the sequence data
      </a></span></dt><dt><span class="sect2"><a href="#sect_map_ma_copying_and_naming_the_reference_sequence">6.2.2. 
	Copying and naming the reference sequence
      </a></span></dt><dt><span class="sect2"><a href="#sect_map_ge_writing_a_simple_manifest_file">6.2.3. 
	Writing a simple manifest file
      </a></span></dt><dt><span class="sect2"><a href="#sect_map_ge_starting_assembly">6.2.4. Starting the assembly</a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_map_manifest_files_use_cases">6.3. 
      Manifest files for different use cases
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_map_mf_mapping_with_shotgun_data">6.3.1. 
	Mapping with shotgun data
      </a></span></dt><dt><span class="sect2"><a href="#sect_map_mf_manifest_for_pairedend_data">6.3.2. 
	Manifest for data sets with paired reads
      </a></span></dt><dt><span class="sect2"><a href="#sect_map_mf_mapping_with_multiple_technologies">6.3.3. 
	Mapping with multiple sequencing technologies (hybrid mapping)
      </a></span></dt><dt><span class="sect2"><a href="#sect_map_mf_mapping_with_multiple_strains">6.3.4. 
	Mapping with multiple strains
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_map_walkthroughs">6.4. 
      Walkthroughs
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_map_walkthrough:_mapping_of_ecoli_from_lenski_lab_against_ecoli_b_rel606">6.4.1. 
	Walkthrough: mapping of E.coli from Lenski lab against E.coli B REL606
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_map_useful_about_reference_sequences">6.5. 
      Useful things to know about reference sequences
    </a></span></dt><dt><span class="sect1"><a href="#sect_map_known_bugs_problems">6.6. 
      Known bugs / problems
    </a></span></dt></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">You have to know what you're looking for before you can find it.</span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_map_introduction"></a>6.1. 
      Introduction
    </h2></div></div></div><p>
      This guide assumes that you have basic working knowledge of Unix systems, know
      the basic principles of sequencing (and sequence assembly) and what assemblers
      do.
    </p><p>
      While there are step by step instructions on how to setup your data and
      then perform an assembly, this guide expects you to read at some point in time
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	  Before the mapping, <a class="xref" href="#chap_dataprep" title="Chapter 4. Preparing data">Chapter 4: &#8220;<i>Preparing data</i>&#8221;</a> to know what to do (or not to
	  do) with the sequencing data before giving it to MIRA.
	</p></li><li class="listitem"><p>
	  Generally, the <a class="xref" href="#chap_results" title="Chapter 9. Working with the results of MIRA">Chapter 9: &#8220;<i>Working with the results of MIRA</i>&#8221;</a> to know what to do with the
	  results of the assembly. More specifically, <a class="xref" href="#sect_res_converting_results" title="9.3.  Converting results">Section 9.3: &#8220;
      Converting results
    &#8221;</a> <a class="xref" href="#sect_res_places_of_interest_in_a_mapping_assembly" title="9.6.  Places of interest in a mapping assembly">Section 9.6: &#8220;
      Places of interest in a mapping assembly
    &#8221;</a> <a class="xref" href="#sect_res_postprocessing_mapping_assemblies" title="9.7.  Post-processing mapping assemblies">Section 9.7: &#8220;
      Post-processing mapping assemblies
    &#8221;</a>
	</p></li><li class="listitem"><p>
	  And also <a class="xref" href="#chap_reference" title="Chapter 3. MIRA 4 reference manual">Chapter 3: &#8220;<i>MIRA 4 reference manual</i>&#8221;</a> to look up how manifest files should be
	  written (<a class="xref" href="#sect_ref_manifest_basics" title="3.4.2.  The manifest file: basics">Section 3.4.2: &#8220;
	The manifest file: basics
      &#8221;</a> and <a class="xref" href="#sect_ref_manifest_readgroups" title="3.4.3.  The manifest file: information on the data you have">Section 3.4.3: &#8220;
	The manifest file: information on the data you have
      &#8221;</a> and <a class="xref" href="#sect_ref_manifest_parameters" title="3.4.4.  The manifest file: extended parameters">Section 3.4.4: &#8220;
	The manifest file: extended parameters
      &#8221;</a>), some command line options as well as general information on
	  what tags MIRA uses in assemblies, files it generates etc.pp
	</p></li><li class="listitem"><p>
	  Last but not least, you may be interested in some observations about
	  the different sequencing technologies and the traps they may
	  contain, see <a class="xref" href="#chap_seqtechdesc" title="Chapter 12. Description of sequencing technologies">Chapter 12: &#8220;<i>Description of sequencing technologies</i>&#8221;</a> for that. For advice on what to pay
	  attention to <span class="emphasis"><em>before</em></span> going into a sequencing
	  project, have a look at <a class="xref" href="#chap_seqadvice" title="Chapter 13. Some advice when going into a sequencing project">Chapter 13: &#8220;<i>Some advice when going into a sequencing project</i>&#8221;</a>.
	</p></li></ul></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_map_general"></a>6.2. 
      General steps
    </h2></div></div></div><p>
      This part will show you step by step how to get your data together for a
      simple mapping assembly.
    </p><p>
      I'll make up an example using an imaginary bacterium: <span class="emphasis"><em>Bacillus chocorafoliensis</em></span> (or short: <span class="emphasis"><em>Bchoc</em></span>).
    </p><p>
      In this example, we assume you have two strains: a wild type strain of
      <span class="emphasis"><em>Bchoc_wt</em></span> and a mutant which you perhaps got from mutagenesis or other
      means. Let's imagine that this mutant needs more time to eliminate a given
      amount of chocolate, so we call the mutant <span class="emphasis"><em>Bchoc_se</em></span> ... SE for
      <span class="bold"><strong>s</strong></span>low <span class="bold"><strong>e</strong></span>ater.
    </p><p>
      You wanted to know which mutations might be responsible for the observed
      behaviour. Assume the genome of <span class="emphasis"><em>Bchoc_wt</em></span> is available to you as it was
      published (or you previously sequenced it), so you resequenced <span class="emphasis"><em>Bchoc_se</em></span>
      with Solexa to examine mutations.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_ge_copying_and_naming_the_sequence_data"></a>6.2.1. 
	Copying and naming the sequence data
      </h3></div></div></div><p>
	You need to create (or get from your sequencing provider) the sequencing data
	in either FASTQ or FASTA + FASTA quality format. The following walkthrough
	uses what most people nowadays get: FASTQ.
      </p><p>
	Create a new project directory (e.g. <code class="filename">myProject</code>) and a subdirectory of this which will hold the sequencing data (e.g. <code class="filename">data</code>).
      </p><pre class="screen"><code class="prompt">arcadia:/path/to$</code> <strong class="userinput"><code>mkdir myProject</code></strong>
<code class="prompt">arcadia:/path/to$</code> <strong class="userinput"><code>cd myProject</code></strong>
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>mkdir data</code></strong></pre><p>
	Put the FASTQ data into that <code class="filename">data</code> directory so that it now looks perhaps like this:
      </p><pre class="screen"><code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>ls -l data</code></strong>
-rw-r--r-- 1 bach users 263985896 2008-03-28 21:49 bchocse_lane6.solexa.fastq
-rw-r--r-- 1 bach users 264823645 2008-03-28 21:51 bchocse_lane7.solexa.fastq</pre></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      I completely made up the file names above. You can name them any way you
      want. And you can have them live anywhere on the hard disk, you do not
      need to put them in this <code class="filename">data</code> directory. It's just
      the way I do it ... and it's where the example manifest files a bit further down
      in this chapter will look for the data files.
    </td></tr></table></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_ma_copying_and_naming_the_reference_sequence"></a>6.2.2. 
	Copying and naming the reference sequence
      </h3></div></div></div><p>
	The reference sequence (the backbone) can be in a number of different
	formats: GFF3, GenBank, MAF, CAF, FASTA. The first three have the advantage
	of being able to carry additional information like, e.g.,
	annotation. In this example, we will use a GFF3 file like the ones
	one can download from the NCBI.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	TODO: Write why GFF3 is better and where to get them at the NCBI.
      </td></tr></table></div><p>
	So, let's assume that our wild type
	strain is in the following file:
	<code class="filename">NC_someNCBInumber.gff3</code>.
      </p><p>
	You do not need to copy the reference sequence to your directory, but
	I normally copy also the reference file into the directory with my
	data as I want to have, at the end of my work, a nice little
	self-sufficient directory which I can archive away and still be sure
	that in 10 years time I have all data I need together.
      </p><pre class="screen"><code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>cp /somewhere/NC_someNCBInumber.gff3 data</code></strong>
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>ls -l data</code></strong>
-rw-r--r-- 1 bach users   6543511 2008-04-08 23:53 NC_someNCBInumber.gff3
-rw-r--r-- 1 bach users 263985896 2008-03-28 21:49 bchocse_lane6.solexa.fastq
-rw-r--r-- 1 bach users 264823645 2008-03-28 21:51 bchocse_lane7.solexa.fastq</pre><p>
      We're almost finished with the setup. As I like to have things neatly separated, I always create a directory called <code class="filename">assemblies</code> which will hold my assemblies (or different trials) together. Let's quickly do that:
    </p><pre class="screen"><code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>mkdir assemblies</code></strong>
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>mkdir assemblies/1sttrial</code></strong>
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>cd assemblies/1sttrial</code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_ge_writing_a_simple_manifest_file"></a>6.2.3. 
	Writing a simple manifest file
      </h3></div></div></div><p>
	A manifest file is a configuration file for MIRA which tells it what
	type of assembly it should do and which data it should load. In this
	case we have unpaired sequencing data which we want to map to a
	reference sequence, the manifest file for that is pretty simple:
      </p><pre class="screen"># Example for a manifest describing a mapping assembly with
# unpaired Illumina data

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should map a genome in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,mapping,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# first, the reference sequence
<strong class="userinput"><code>readgroup
is_reference
data = <em class="replaceable"><code>../../data/NC_someNCBInumber.gff3</code></em>
strain = <em class="replaceable"><code>bchoc_wt</code></em></code></strong>

# now the Illumina data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomeUnpairedIlluminaReadsIGotFromTheLab</code></em>
data = <em class="replaceable"><code>../../data/*fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se</code></em></code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  Please look up the parameters of the manifest file in the main
	  manual or the example manifest files in the following section.
	</p><p>
	  The ones above basically say: make an accurate mapping of Solexa
	  reads against a genome; in one pass; the name of the backbone strain
	  is 'bchoc_wt'; the data with the backbone sequence (and maybe
	  annotations) is in a specified GFF3 file; for Solexa data: assign
	  default strain names for reads which have not loaded ancillary data
	  with strain info and that default strain name should be 'bchoc_se'.
	</p></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_ge_starting_assembly"></a>6.2.4. Starting the assembly</h3></div></div></div><p>
	Starting the assembly is now just a matter of a simple command line:
      </p><pre class="screen"><code class="prompt">arcadia:/path/to/myProject/assemblies/1sttrial$</code> <strong class="userinput"><code>mira <em class="replaceable"><code>manifest.conf &gt;&amp;log_assembly.txt</code></em></code></strong></pre><p>
	For this example - if you followed the walk-through on how to prepare the data
	- the only things you might want to adapt at first are the following
	options in the manifest file:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    project= (for naming your assembly project)
	  </p></li><li class="listitem"><p>
	    strain= to give the names of your reference and mapping strains
	  </p></li></ul></div><p>
	Of course, you are free to change any option via the extended parameters, but
	this is the topic of another part of this manual.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_map_manifest_files_use_cases"></a>6.3. 
      Manifest files for different use cases
    </h2></div></div></div><p>
      This section will introduce you to manifest files for different use
      cases. It should cover the most important uses, but as always you are
      free to mix and match the parameters and readgroup definitions to suit
      your specific needs.
    </p><p>
      Taking into account that there may be <span class="emphasis"><em>a lot</em></span> of
      combinations of sequencing technologies, sequencing libraries (shotgun,
      paired-end, mate-pair, etc.) and input file types (FASTQ, FASTA,
      GenBank, GFF3, etc.pp), the example manifest files just use Illumina and
      454 as technologies, GFF3 as input file type for the reference sequence,
      FASTQ as input type for sequencing data ... and they do not show the
      multitude of more advanced features like, e.g., using ancillary clipping
      information in XML files, ancillary masking information in SSAHA2 or
      SMALT files etc.pp.
    </p><p>
      I'm sure you will be able to find your way by scanning through the
      corresponding section on manifest files in the reference chapter :-)
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_mf_mapping_with_shotgun_data"></a>6.3.1. 
	Mapping with shotgun data
      </h3></div></div></div><p>
	Well, we've seen that already in the section above, but here it is
	again ... this time with Ion Torrent data though.
      </p><pre class="screen"># Example for a manifest describing a mapping assembly with
# unpaired Ion data

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should map a genome in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,mapping,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# first, the reference sequence
<strong class="userinput"><code>readgroup
is_reference
data = <em class="replaceable"><code>../../data/NC_someNCBInumber.gff3</code></em>
strain = <em class="replaceable"><code>bchoc_wt</code></em></code></strong>

# now the Ion Torrent data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>SomeUnpairedIonReadsIGotFromTheLab</code></em>
data = <em class="replaceable"><code>../../data/someiondata.fastq</code></em>
technology = <em class="replaceable"><code>iontor</code></em>
strain = <em class="replaceable"><code>bchoc_se</code></em></code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_mf_manifest_for_pairedend_data"></a>6.3.2. 
	Manifest for data sets with paired reads
      </h3></div></div></div><p>
      </p><p>
	When using paired-end data in mapping, you must decide whether you want
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    to use the MIRA feature to create long 'coverage equivalent reads'
	    (CERs) which saves a lot of memory (both in the assembler and
	    later on in an assembly editor). However, you then
	    <span class="emphasis"><em>lose information about read pairs!</em></span>
	  </p></li><li class="listitem"><p>
	    or whether you want to <span class="emphasis"><em>keep information about read
	    pairs</em></span> at the expense of larger memory requirements both
	    in MIRA and in assembly finishing tools or viewers afterwards.
	  </p></li><li class="listitem"><p>
	    or a mix of the two above
	  </p></li></ol></div><p>
	The Illumina pipeline normally gives you two files for paired-end
	data: a <code class="filename">project-1.fastq</code> and
	<code class="filename">project-2.fastq</code>. The first file contains the
	first read of a read-pair, the second file the second read. Depending
	on the preprocessing pipeline of your sequencing provider, the names
	of the reads are either the very same in both files or already have
	a <code class="literal">/1</code> or <code class="literal">/2</code> appended. Also, your
	sequencing provider may give you one big file where the reads from
	both ends are present.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  MIRA can read all FASTQ variants produced by various Illumina
	  pipelines, be they with or without the /1 and /2 already appended to
	  the names. You generally do not need to do any name mangling before
	  feeding the data to MIRA. However, MIRA will issue a warning if read names are longer than 40 characters.
	</p></td></tr></table></div><p>
	When using paired-end data, you should know
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    the orientation of the reads toward each other. This is specific
	    to sequencing technologies and / or the sequencing library preparation.
	  </p></li><li class="listitem"><p>
	    at which distance these reads should be. This is specific to the
	    sequencing library preparation and the sequencing lab should tell
	    you this.
	  </p></li></ol></div><p>
	In case you do not know one (or any) of the above, don't panic! MIRA
	is able to estimate the needed values during the assembly if you tell
	it to.
      </p><p>
	The following manifest shows you the laziest way to define a
	paired data set by simply adding <span class="emphasis"><em>autopairing</em></span> as keyword to a
	readgroup (using Illumina just as example):
      </p><pre class="screen"># Example for a lazy manifest describing a mapping assembly with
# one library of paired reads

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should map a genome in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,mapping,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# first the reference sequence
<strong class="userinput"><code>readgroup
is_reference
data = <em class="replaceable"><code>../../data/NC_someNCBInumber.gff3</code></em>
technology = <em class="replaceable"><code>text</code></em>
strain = <em class="replaceable"><code>bchoc_wt</code></em></code></strong>

# now the Illumina paired-end data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataIlluminaPairedLib</code></em>
<em class="replaceable"><code>autopairing</code></em>
data = <em class="replaceable"><code>../../data/project_1.fastq ../../data/project_2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em>
</code></strong></pre><p>
	See? Wasn't hard and it did not hurt, did it? One just needs to tell
	MIRA it should expect paired reads via
	the <span class="emphasis"><em>autopairing</em></span> keyword and that is everything you
	need.
      </p><p>
	If you know the orientation of the reads and/or the library size, you
	can tell MIRA this the following way (just showing the readgroup
	definition here):
      </p><pre class="screen"><strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataIlluminaPairedEnd500Lib</code></em>
data = <em class="replaceable"><code>../../data/project_1.fastq ../../data/project_2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
template_size = <em class="replaceable"><code>250 750</code></em>
segment_placement = <em class="replaceable"><code>---&gt; &lt;---</code></em></code></strong></pre><p>
	In case you are not 100% sure about, e.g., the size of the DNA
	template, you can also give a (generous) expected range and then tell
	MIRA to automatically refine this range during the assembly based on
	real, observed distances of read pairs. Do this with the <span class="emphasis"><em>autorefine</em></span>
	modifier like this:
      </p><pre class="screen"><strong class="userinput"><code>template_size = <em class="replaceable"><code>50 1000 autorefine</code></em></code></strong></pre><p>
	The following manifest file is an example for mapping a 500 bp
	paired-end and a 3kb mate-pair library of a strain
	called <span class="emphasis"><em>bchoc_se1</em></span> against a GFF3 reference
	file containing a strain called <span class="emphasis"><em>bchoc_wt</em></span>:
      </p><pre class="screen"># Example for a manifest describing a mapping assembly with
# paired Illumina data, not merging reads and therefore keeping
# all pair information

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should map a genome in accurate mode
# As special parameter, we want to switch off merging of Solexa reads

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,mapping,accurate</code></em>
parameters = <em class="replaceable"><code>SOLEXA_SETTINGS -CO:msr=no</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# first, the reference sequence
<strong class="userinput"><code>readgroup
is_reference
data = <em class="replaceable"><code>../../data/NC_someNCBInumber.gff3</code></em>
technology = <em class="replaceable"><code>text</code></em>
strain = <em class="replaceable"><code>bchoc_wt</code></em></code></strong>

# now the Illumina data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForPairedEnd500bpLib</code></em>
<em class="replaceable"><code>autopairing</code></em>
data = <em class="replaceable"><code>../../data/project500bp-1.fastq ../../data/project500bp-2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em></code></strong>

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForMatePair3kbLib</code></em>
data = <em class="replaceable"><code>../../data/project3kb-1.fastq ../../data/project3kb-2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em>
template_size = <em class="replaceable"><code>2000 4000 autorefine</code></em>
segment_placement = <em class="replaceable"><code>&lt;--- ---&gt;</code></em></code></strong></pre><p>
	Please look up the parameters used in the main manual. The ones
	above basically say: make an accurate mapping of Solexa reads
	against a genome. Additionally, do not merge short Solexa
	reads to the contig.
      </p><p>
	For the paired-end library, be lazy and let MIRA find out everything
	it needs. However, that information should be treated as
	"information only" by MIRA, i.e., it is not used for deciding whether
	a pair is well mapped.
      </p><p>
	For the mate-pair library, assume a DNA template size of
	2000 to 4000 bp (but let MIRA automatically refine this using observed
	distances) and the segment orientation of the read pairs follows
	the reverse / forward scheme. That information should be treated as
	"information only" by MIRA, i.e., it is not used for deciding whether
	a pair is well mapped.
      </p><p>
	Comparing this manifest with a manifest for unpaired data, three
	parameters were added for the Solexa data:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    <code class="literal">-CO:msr=no</code> tells MIRA not to merge reads that
	    are 100% identical to the backbone. This also allows MIRA to keep the
	    template information (distance and orientation) for the reads.
	  </p></li><li class="listitem"><p>
	    <code class="literal">template_size</code> tells MIRA at which distance the
	    two reads should normally be placed from each other.
	  </p></li><li class="listitem"><p>
	    <code class="literal">segment_placement</code> tells MIRA how the different
	    segments (reads) of a DNA template have to be ordered to form a
	    valid representation of the sequenced DNA.
	  </p></li></ol></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  Note that in mapping assemblies, these
	  <code class="literal">template_size</code> and
	  <code class="literal">segment_placement</code> parameters are normally treated
	  as <span class="emphasis"><em>information only</em></span>, i.e., MIRA will map the
	  reads regardless of whether the distance and orientation criteria are
	  met or not. This enables post-mapping analysis programs to hunt for
	  genome rearrangements or larger insertions/deletions.
	</p></td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
	  If template size and segment placement checking were on, the
	  following would happen at, e.g. sites of re-arrangement: MIRA would
	  map the first read of a read-pair without problem. However, it would
	  very probably reject the second read because it would not map at the
	  specified distance or orientation from its partner. Therefore, in
	  mapping assemblies with paired-end data, checking of the template
	  size must be switched off to give post-processing programs a chance
	  to spot re-arrangements.
	</p></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_mf_mapping_with_multiple_technologies"></a>6.3.3. 
	Mapping with multiple sequencing technologies (hybrid mapping)
      </h3></div></div></div><p>
	I'm sure you'll have picked up the general scheme of manifest files by
	now. Hybrid mapping assemblies follow the general scheme: simply add
	as separate readgroup the information MIRA needs to know to find the
	data and off you go. Just for laughs, here's a manifest for 454
	shotgun data combined with Illumina paired-end data:
      </p><pre class="screen"># Example for a manifest describing a mapping assembly with
# shotgun 454 and paired-end Illumina data, not merging reads and therefore keeping
# all pair information

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should map a genome in accurate mode
# As special parameter, we want to switch off merging of Solexa reads

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,mapping,accurate</code></em>
parameters = <em class="replaceable"><code>SOLEXA_SETTINGS -CO:msr=no</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# first, the reference sequence
<strong class="userinput"><code>readgroup
is_reference
data = <em class="replaceable"><code>../../data/NC_someNCBInumber.gff3</code></em>
strain = <em class="replaceable"><code>bchoc_wt</code></em></code></strong>

# now the shotgun 454 data
<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForShotgun454</code></em>
data = <em class="replaceable"><code>../../data/project454data.fastq</code></em>
technology = <em class="replaceable"><code>454</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em></code></strong>

# now the paired-end Illumina data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForPairedEnd500bpLib</code></em>
data = <em class="replaceable"><code>../../data/project500bp-1.fastq ../../data/project500bp-2.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em>
template_size = <em class="replaceable"><code>250 750</code></em>
segment_placement = <em class="replaceable"><code>---&gt; &lt;---</code></em></code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_mf_mapping_with_multiple_strains"></a>6.3.4. 
	Mapping with multiple strains
      </h3></div></div></div><p>
	MIRA will make use of ancillary information present in the manifest
	file. One of these is the information to which strain (or organism or
	cell line etc.pp) the generated data belongs.
      </p><p>
	You just need to tell in the manifest file which data comes from which
	strain. Let's assume that in the example from above, the "lane6" data
	were from a first mutant named <span class="emphasis"><em>bchoc_se1</em></span> and the
	"lane7" data were from a second mutant
	named <span class="emphasis"><em>bchoc_se2</em></span>. Here's the manifest file you
	would write then:
      </p><pre class="screen"># Example for a manifest describing a mapping assembly with
# unpaired Illumina data

# First part: defining some basic things
# In this example, we just give a name to the assembly
#  and tell MIRA it should map a genome in accurate mode

<strong class="userinput"><code>project = <em class="replaceable"><code>MyFirstAssembly</code></em>
job = <em class="replaceable"><code>genome,mapping,accurate</code></em></code></strong>

# The second part defines the sequencing data MIRA should load and assemble
# The data is logically divided into "readgroups"

# first, the reference sequence
<strong class="userinput"><code>readgroup
is_reference
data = <em class="replaceable"><code>../../data/NC_someNCBInumber.gff3</code></em>
technology = <em class="replaceable"><code>text</code></em>
strain = <em class="replaceable"><code>bchoc_wt</code></em></code></strong>

# now the Illumina data

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForSE1</code></em>
data = <em class="replaceable"><code>../../data/bchocse_lane6.solexa.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se1</code></em></code></strong>

<strong class="userinput"><code>readgroup = <em class="replaceable"><code>DataForSE2</code></em>
data = <em class="replaceable"><code>../../data/bchocse_lane7.solexa.fastq</code></em>
technology = <em class="replaceable"><code>solexa</code></em>
strain = <em class="replaceable"><code>bchoc_se2</code></em></code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	While mapping (or even assembling de-novo) with multiple strains is
	possible, the interpretation of results may become a bit daunting in
	some cases. For many scenarios it might therefore be preferable to
	successively use the data sets in their own separate mappings or assemblies.
      </td></tr></table></div><p>
	This <span class="emphasis"><em>strain</em></span> information for each readgroup is really the only change you need to perform to tell MIRA everything it needs for handling strains.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_map_walkthroughs"></a>6.4. 
      Walkthroughs
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_map_walkthrough:_mapping_of_ecoli_from_lenski_lab_against_ecoli_b_rel606"></a>6.4.1. 
	Walkthrough: mapping of E.coli from Lenski lab against E.coli B REL606
      </h3></div></div></div><p>
	TODO: Sorry, needs to be re-written for the relatively new SRR format
	distributed at the NCBI ... and changes in MIRA 3.9.x. Please come
	back later.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_map_useful_about_reference_sequences"></a>6.5. 
      Useful things to know about reference sequences
    </h2></div></div></div><p>
      There are a few things to consider when using reference sequences:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  MIRA is not really made to handle a big amount of reference
	  sequences as they currently need inane amounts of memory. Use other
	  programs for mapping against more than, say, 200 megabases.
	</p></li><li class="listitem"><p>
	  Reference sequences can be as long as needed! They are not subject
	  to normal read length constraints of a maximum of 32k bases. That
	  is, if one wants to load one or several entire chromosomes of a
	  bacterium or lower eukaryote as backbone sequence(s), this is just
	  fine.
	</p></li><li class="listitem"><p>
	  Reference sequences can be single sequences like provided in, e.g.,
	  FASTA, FASTQ, GFF or GenBank files. But reference sequences also can
	  be whole assemblies when they are provided as, e.g., MAF or CAF
	  format. This opens the possibility to perform semi-hybrid assemblies
	  by first assembling reads from one sequencing technology de-novo
	  (e.g. PacBio) and then mapping reads from another sequencing technology
	  (e.g. Solexa) to the whole PacBio alignment instead of mapping it to
	  the PacBio consensus.
	</p><p>
	  A semi-hybrid assembly will therefore contain, like a hybrid
	  assembly, the reads of both sequencing technologies.
	</p></li><li class="listitem"><p>
	  Reference sequences will not be reversed! They will always appear in
	  forward direction in the output of the assembly. Please note: if the
	  backbone sequence consists of a MAF or CAF file that contains contigs
	  which contain reversed reads, then the contigs themselves will be in
	  forward direction. But the reads they contain that are in reverse
	  complement direction will of course also stay in reverse complement
	  direction.
	</p></li><li class="listitem"><p>
	  Reference sequences will not be assembled together! That is,
	  even if a reference sequence has a perfect overlap with another
	  reference sequence, they will still not be merged.
	</p></li><li class="listitem"><p>
	  Reads are assembled to reference sequences in a first come, first
	  served scattering strategy.
	</p><p>
	  Suppose you have two identical reference sequences and a read which
	  would match both, then the read would be mapped to the first
	  backbone. If you had two identical reads, the first read would go to
	  the first backbone, the second read to the second backbone. With
	  three identical reads, the first backbone would get two reads, the
	  second backbone one read. Etc.pp.
	</p></li><li class="listitem"><p>
	  Only in references loaded from MAF or CAF files: contigs made out of
	  single reads (singlets) lose their status as reference sequence and
	  will be returned to the normal read pool for the assembly
	  process. That is, these sequences will be assembled to other
	  reference sequences or with each other.
	</p></li></ol></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_map_known_bugs_problems"></a>6.6. 
      Known bugs / problems
    </h2></div></div></div><p>
      These are current as of version 4.0 of MIRA and might or might not have been
      addressed in later versions.
    </p><p>
      Bugs:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  mapping of paired-end reads with one read being in non-repetitive
	  area and the other in a repeat is not as effective as it should
	  be. The optimal strategy to use would be to map first the
	  non-repetitive read and then the read in the repeat. Unfortunately,
	  this is not yet implemented in MIRA.
	</p></li></ol></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_est"></a>Chapter 7. EST / RNASeq assemblies</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect1_est_introduction">7.1. 
      Introduction
    </a></span></dt><dt><span class="sect1"><a href="#sect1_est_preliminaries:on_the_difficulties_of_assembling_ests">7.2. 
      Preliminaries: on the difficulties of assembling ESTs
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect2_est_poly-a_tails_in_est_data">7.2.1. 
	Poly-A tails in EST data
      </a></span></dt><dt><span class="sect2"><a href="#sect2_est_lowly_expressed_transcripts">7.2.2. 
	Lowly expressed transcripts
      </a></span></dt><dt><span class="sect2"><a href="#sect_est_chimeras">7.2.3. 
	Chimeras
      </a></span></dt><dt><span class="sect2"><a href="#sect2_est_library_normalisation">7.2.4. 
	Missing library normalisation: very highly expressed transcripts
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#est_sect1_est_preprocessing">7.3. 
      Preprocessing of ESTs
    </a></span></dt><dt><span class="sect1"><a href="#sect1_est_est_difference_assembly_clustering">7.4. 
      The difference between <span class="emphasis"><em>assembly</em></span> and
      <span class="emphasis"><em>clustering</em></span>
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect2_est_snp_splitting">7.4.1. 
	Splitting transcripts into contigs based on SNPs
      </a></span></dt><dt><span class="sect2"><a href="#sect2_est_gap_splitting">7.4.2. 
	Splitting transcripts into contigs based on larger gaps
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect1_est_mira_and_mirasearchestsnps">7.5. 
      mira and miraSearchESTSNPs
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect2_using_mira_for_est_assembly">7.5.1. 
	Using mira for EST assembly
      </a></span></dt><dt><span class="sect2"><a href="#sect2_using_mira_for_est_clustering">7.5.2. 
	Using mira for EST clustering
      </a></span></dt><dt><span class="sect2"><a href="#sect2_using_mirasearchestsnps_for_est_assembly">7.5.3. 
	Using miraSearchESTSNPs for EST assembly
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#idp42648112">7.6. 
      Solving common problems of EST assemblies
    </a></span></dt></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">Expect the worst. You'll never get disappointed.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect1_est_introduction"></a>7.1. 
      Introduction
    </h2></div></div></div><p>
      This document is not complete yet and some sections may be a bit
      unclear. I'd be happy to receive suggestions for improvements.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note: 
	Some reading requirements
      "><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">
	Some reading requirements
      </th></tr><tr><td align="left" valign="top"><p>
	This guide assumes that you have basic working knowledge of Unix systems, know
	the basic principles of sequencing (and sequence assembly) and what assemblers
	do.
      </p><p>
	Basic knowledge on mRNA transcription and EST sequences should also be
	present.
      </p><p>
	Please read at some point in time
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    Before the assembly, <a class="xref" href="#chap_dataprep" title="Chapter 4. Preparing data">Chapter 4: &#8220;<i>Preparing data</i>&#8221;</a> to know what to do (or not to
	    do) with the sequencing data before giving it to MIRA.
	  </p></li><li class="listitem"><p>
	    For setting up the assembly, <a class="xref" href="#chap_denovo" title="Chapter 5. De-novo assemblies">Chapter 5: &#8220;<i>De-novo assemblies</i>&#8221;</a> to know how to
	    start a de-novo assembly (except you obviously will need to change
	    the --job setting from <span class="emphasis"><em>genome</em></span> to
	    <span class="emphasis"><em>est</em></span>).
	  </p></li><li class="listitem"><p>
	    After the assembly, <a class="xref" href="#chap_results" title="Chapter 9. Working with the results of MIRA">Chapter 9: &#8220;<i>Working with the results of MIRA</i>&#8221;</a> to know what to do with the
	    results of the assembly. More specifically, <a class="xref" href="#sect_res_looking_at_results" title="9.1.  MIRA output directories and files">Section 9.1: &#8220;
      MIRA output directories and files
    &#8221;</a>, <a class="xref" href="#sect_res_first_look:the_assembly_info" title="9.2.  First look: the assembly info">Section 9.2: &#8220;
      First look: the assembly info
    &#8221;</a>, <a class="xref" href="#sect_res_converting_results" title="9.3.  Converting results">Section 9.3: &#8220;
      Converting results
    &#8221;</a>, <a class="xref" href="#sect_res_filtering_of_results" title="9.4.  Filtering results">Section 9.4: &#8220;
      Filtering results
    &#8221;</a> and <a class="xref" href="#sect_res_places_of_importance_in_a_de_novo_assembly" title="9.5.  Places of importance in a de-novo assembly">Section 9.5: &#8220;
      Places of importance in a de-novo assembly
    &#8221;</a>.
	  </p></li><li class="listitem"><p>
	    And also <a class="xref" href="#chap_reference" title="Chapter 3. MIRA 4 reference manual">Chapter 3: &#8220;<i>MIRA 4 reference manual</i>&#8221;</a> to look up how manifest files should be
	    written (<a class="xref" href="#sect_ref_manifest_basics" title="3.4.2.  The manifest file: basics">Section 3.4.2: &#8220;
	The manifest file: basics
      &#8221;</a> and <a class="xref" href="#sect_ref_manifest_readgroups" title="3.4.3.  The manifest file: information on the data you have">Section 3.4.3: &#8220;
	The manifest file: information on the data you have
      &#8221;</a> and <a class="xref" href="#sect_ref_manifest_parameters" title="3.4.4.  The manifest file: extended parameters">Section 3.4.4: &#8220;
	The manifest file: extended parameters
      &#8221;</a>), some command line options as well as general information on
	    what tags MIRA uses in assemblies, files it generates etc.pp
	  </p></li></ul></div></td></tr></table></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect1_est_preliminaries:on_the_difficulties_of_assembling_ests"></a>7.2. 
      Preliminaries: on the difficulties of assembling ESTs
    </h2></div></div></div><p>
      Assembling ESTs can be, from an assembler's point of view, pure
      horror. E.g., it may be that some genes have thousands of transcripts
      while other genes have just one single transcript in the sequenced
      data. Furthermore, the presence of 5' and 3' UTR, transcription
      variants, splice variants, homologues, SNPs etc.pp complicates the
      assembly in some rather interesting ways.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_est_poly-a_tails_in_est_data"></a>7.2.1. 
	Poly-A tails in EST data
      </h3></div></div></div><p>
	Poly-A tails are part of the mRNA and therefore also part of sequenced
	data. They can occur as poly-A or poly-T, depending from which
	direction and which part of the mRNA was sequenced. Having poly-A/T
	tails in the data is something of a double-edged sword. More
	specifically, if the 3' poly-A tail is kept unmasked in the data,
	transcripts having this tail will very probably not align with similar
	transcripts from different splice variants (which is basically
	good). On the other hand, homopolymers (multiple consecutive bases of
	the same type) like poly-As are features that are pretty difficult to
	get correct with today's sequencing technologies, be it Sanger, Solexa
	or, with even more problems, 454. So slight errors in the
	poly-A tail could lead to wrongly assigned splice sites ... and
	wrongly split contigs.
      </p><p>
	This is the reason why many people cut off the poly-A tails. Which in
	turn may lead to transcripts from different splice variants being
	assembled together.
      </p><p>
	Either way, it's not pretty.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_est_lowly_expressed_transcripts"></a>7.2.2. 
	Lowly expressed transcripts
      </h3></div></div></div><p>
	Single transcripts (or very lowly expressed transcripts) containing
	SNPs, splice variants or similar differences to other, more highly
	expressed transcripts are a problem: it's basically impossible for an
	assembler to distinguish them from reads containing junky data
	(e.g. reads with a high error rate or chimeras). The standard setting
	of many EST assemblers and clusterers is therefore to remove these
	reads from the assembly set. MIRA handles things a bit differently:
	depending on the settings, single transcripts with sufficiently large
	differences are either treated as debris or can be saved as
	<span class="emphasis"><em>singlet</em></span>.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_est_chimeras"></a>7.2.3. 
	Chimeras
      </h3></div></div></div><p>
	Chimeras are sequences containing adjacent base stretches which are
	not occurring in an organism as sequenced, neither as DNA nor as
	(m)RNA. Chimeras can be created through recombination effects during
	library construction or sequencing. Chimeras can, and often do, lead
	to misassemblies of sequence stretches into one contig although they
	do not belong together. Have a look at the following example where two
	stretches (denoted by <code class="literal">x</code> and <code class="literal">o</code>)
	are joined by a chimeric read <span class="emphasis"><em>r4</em></span> containing both
	stretches:
      </p><pre class="screen">
r1 xxxxxxxxxxxxxxxx
r2 xxxxxxxxxxxxxxxxx
r3 xxxxxxxxxxxxxxxxx
r4 xxxxxxxxxxxxxxxxxxx|oooooooooooooo
r5                        ooooooooooo
r6                        ooooooooooo
r7                          ooooooooo</pre><p>
      The site of the recombination event is denoted by <code class="literal">x|o</code>
      in read <span class="emphasis"><em>r4</em></span>.
    </p><p>
      MIRA does have a chimera detection -- which works very well in genome
      assemblies due to high enough coverage -- by searching for sequence
      stretches which are not covered by overlaps. In the above example, the
      chimera detection routine will almost certainly flag read
      <span class="emphasis"><em>r4</em></span> as chimera and only use a part of it: either the
      <code class="literal">x</code> or <code class="literal">o</code> part, depending on which
      part is longer. There is always a chance that <span class="emphasis"><em>r4</em></span> is
      a valid read though, but that's a risk to take.
    </p><p>
      Now, that strategy would also work totally fine in EST projects if one
      would not have to account for lowly expressed genes. Imagine the
      following situation:
    </p><pre class="screen">
s1 xxxxxxxxxxxxxxxxx
s2         xxxxxxxxxxxxxxxxxxxxxxxxx
s3                          xxxxxxxxxxxxxxx
    </pre><p>
      Look at read <span class="emphasis"><em>s2</em></span>; from an overlap coverage
      perspective, <span class="emphasis"><em>s2</em></span> could also very well be a chimera,
      leading to a break of an otherwise perfectly valid contig if
      <span class="emphasis"><em>s2</em></span> were cut back accordingly. This is why chimera
      detection is switched off by default in MIRA.
    </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
	When starting an EST assembly via the <code class="literal">--job=est,...</code>
	switch, chimera detection is switched off by default. It is absolutely
	possible to switch on the SKIM chimera detection afterwards via
	 [-CL:ascdc]. However, this will have exactly the effects
	described above: chimeras in higher coverage contigs will be detected,
	but perfectly valid low coverage contigs will be torn apart.
      </p><p>
	It is up to you to decide what you want or need.
      </p></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_est_library_normalisation"></a>7.2.4. 
	Missing library normalisation: very highly expressed transcripts
      </h3></div></div></div><p>
	Another interesting problem for de-novo assemblers are non-normalised
	EST libraries. In each cell, the number of mRNA copies per gene may
	differ by several orders of magnitude, from a single transcript to
	several tens of thousands. Pre-sequencing normalisation is a wet-lab
	procedure to approximately equalise those copy numbers. This can,
	however, introduce other artifacts.
      </p><p>
	If an assembler is fed with non-normalised EST data, it may very well
	be that an overwhelming number of the reads comes only from a few
	genes (house-keeping genes). In Sanger sequencing projects this could
	mean a couple of thousand reads per gene. In 454 sequencing projects,
	this can mean several tens of thousands of reads per gene. With
	Solexa data, this number can grow to something close to a million.
      </p><p>
	Several effects then hit a de-novo assembler, the three most annoying
	being (in ascending order of annoyance): a) non-random sequencing
	errors then look like valid SNPs, b) sequencing and library
	construction artefacts start to look like valid sequences if the data
	set was not cleaned "enough" and more importantly, c) an explosion in
	time and memory requirements when attempting to deliver a "good"
	assembly. A sure sign of the latter are messages from MIRA about
	<span class="emphasis"><em>megahubs</em></span> in the data set.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	The guide on how to tackle <span class="emphasis"><em>hard</em></span> projects with
	MIRA gives an overview on how to hunt down sequences which can lead to
	the assembler getting confused, be it sequencing artefacts or highly
	expressed genes.
      </td></tr></table></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="est_sect1_est_preprocessing"></a>7.3. 
      Preprocessing of ESTs
    </h2></div></div></div><p>
      With contributions from Katrina Dlugosch
    </p><p>
      EST sequences necessarily contain fragments of vectors or primers used
      to create cDNA libraries from RNA, and may additionally contain primer
      and adaptor sequences used during amplification-based library
      normalisation and/or high-throughput sequencing.  These contaminant
      sequences need to be removed prior to assembly.  MIRA can trim sequences
      by taking contaminant location information from a SSAHA2 or SMALT search
      output, or users can remove contaminants beforehand by trimming
      sequences themselves or masking unwanted bases with lowercase or other
      characters (e.g. 'x', as with <span class="command"><strong>cross_match</strong></span>).  Many
      folks use preprocessing trimming/masking pipelines because it can be
      very important to try a variety of settings to verify that you've
      removed all of your contaminants (and fragments thereof) before sending
      them into an assembly program like MIRA.  It can also be good to spend
      some time seeing what contaminants are in your data, so that you get to
      know what quality issues are present and how pervasive.
    </p><p>
      Two features of next generation sequencing can introduce errors into
      contaminant sequences that make them particularly difficult to remove,
      arguing for preprocessing: First, most next-generation sequence
      platforms seem to be sensitive to excess primers present during library
      preparation, and can produce a small percentage of sequences composed
      entirely of concatenated primer fragments.  These are among the most
      difficult contaminants to remove, and the program TagDust (<a class="ulink" href="http://genome.gsc.riken.jp/osc/english/dataresource/" target="_top">http://genome.gsc.riken.jp/osc/english/dataresource/</a>) was
      recently developed specifically to address this problem. Second, 454 EST
      data sets can show high variability within primer sequences designed to
      anchor to polyA tails during cDNA synthesis, because 454 has trouble
      calling the length of the necessary A and T nucleotide repeats with
      accuracy.
    </p><p>
      A variety of programs exist for preprocessing.  Popular ones include
      cross_match (<a class="ulink" href="http://www.phrap.org/phredphrapconsed.html" target="_top">http://www.phrap.org/phredphrapconsed.html</a>)
      for primer masking, and SeqClean (<a class="ulink" href="http://compbio.dfci.harvard.edu/tgi/software/" target="_top">http://compbio.dfci.harvard.edu/tgi/software/</a>), Lucy (<a class="ulink" href="http://lucy.sourceforge.net/" target="_top">http://lucy.sourceforge.net/</a>), and SeqTrim (<a class="ulink" href="http://www.scbi.uma.es/cgi-bin/seqtrim/seqtrim_login.cgi" target="_top">http://www.scbi.uma.es/cgi-bin/seqtrim/seqtrim_login.cgi</a>) for
      both primer and polyA/T trimming.  The pipeline SnoWhite (<a class="ulink" href="http://evopipes.net" target="_top">http://evopipes.net</a>) combines SeqClean and TagDust with custom
      scripts for aggressive sequence and polyA/T trimming (and is tolerant of
      data already masked using cross_match).  In all cases, the user must
      provide contaminant sequence information and adjust settings for how
      sensitive the programs should be to possible matches.  To find the best
      settings, it is helpful to look directly at some of the sequences that
      are being trimmed and inspect them for remaining primer and/or polyA/T
      fragments after cleaning.
    </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
      When using <span class="command"><strong>mira</strong></span> or
      <span class="command"><strong>miraSearchESTSNPs</strong></span> with the simplest parameter
      calls (using the "--job=..." quick switches), the default settings used
      include pretty heavy sequence pre-processing to cope with noisy
      data. Especially if you have your own pre-processing pipeline, you
      <span class="emphasis"><em>must</em></span> then switch off different clip algorithms that
      you might have applied previously yourself. Especially poly-A clips
      should never be run twice (by your pipeline and by
      <span class="command"><strong>mira</strong></span>) as they invariably lead to too many bases being
      cut away in some sequences.
    </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      Here too: In some cases MIRA can get confused if something with the
      pre-processing went wrong because, e.g., unexpected sequencing artefacts
      like unknown sequencing vectors or adaptors remain in data. The guide on
      how to tackle <span class="emphasis"><em>hard</em></span> projects with MIRA gives an
      overview on how to hunt down sequences which can lead to the assembler
      getting confused, be it sequencing artefacts or highly expressed genes.
    </td></tr></table></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect1_est_est_difference_assembly_clustering"></a>7.4. 
      The difference between <span class="emphasis"><em>assembly</em></span> and
      <span class="emphasis"><em>clustering</em></span>
    </h2></div></div></div><p>
      MIRA in its base settings is an <span class="emphasis"><em>assembler</em></span> and not a
      <span class="emphasis"><em>clusterer</em></span>, although it can be configured as such. As an
      assembler, it will split up read groups into different contigs if it
      thinks there is enough evidence that they come from different RNA
      transcripts.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_est_snp_splitting"></a>7.4.1. 
	Splitting transcripts into contigs based on SNPs
      </h3></div></div></div><p>
	Imagine this simple case: a gene has two slightly different alleles and you've
	sequenced this:
      </p><pre class="screen">
A1-1  ...........T...........
A1-2  ...........T...........
A1-3  ...........T...........
A1-4  ...........T...........
A1-5  ...........T...........
B2-1  ...........G...........
B2-2  ...........G...........
B2-3  ...........G...........
B2-4  ...........G...........
      </pre><p>
	Depending on base qualities and settings used during the assembly
	like, e.g., [-CO:mr:mrpg:mnq:mgqrt:emea:amgb] MIRA will
	recognise that there's enough evidence for a T and also enough
	evidence for a G at that position and create two contigs, one
	containing the "T" allele, one the "G". The consensus will be &gt;99%
	identical, but not 100%.
      </p><p>
	Things become complicated if one has to account for errors in
	sequencing. Imagine you sequenced the following case:
      </p><pre class="screen">
A1-1  ...........T...........
A1-2  ...........T...........
A1-3  ...........T...........
A1-4  ...........T...........
A1-5  ...........T...........
B2-1  ...........<span class="bold"><strong>G</strong></span>...........
      </pre><p>
	It shows very much the same situation as the one above, except that
	there's only one read with a "G" instead of 4 reads. MIRA will, when
	using standard settings, treat this as an erroneous base and leave all
	these reads in a contig. It will likewise also not mark it as SNP in
	the results. However, this could also very well be a lowly expressed
	transcript with a single base mutation. It's virtually impossible to
	tell which of the possibilities is right.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	You can of course force MIRA to mark situations like the one depicted
	above by, e.g., changing the parameters
	for [-CO:mrpg:mnq:mgqrt]. But this may have the side-effect
	that sequencing errors get an increased chance of getting flagged as
	SNP.
      </td></tr></table></div><p>
	Further complications arise when SNPs and potential sequencing errors
	meet at the same place. Consider the following case:
      </p><pre class="screen">
A1-1  ...........T...........
A1-2  ...........T...........
A1-3  ...........T...........
A1-4  ...........T...........
A1-5  ...........T...........
B2-1  ...........G...........
B2-2  ...........G...........
B2-3  ...........G...........
B2-4  ...........G...........
E1-1  ...........<span class="bold"><strong>A</strong></span>...........
      </pre><p>
	This example is exactly like the first one, except an additional read
	<code class="literal">E1-1</code> has made its appearance and has an "A"
	instead of a "G" or "T". Again it is impossible to tell whether this
	is a sequencing error or a real SNP. MIRA handles these cases in the
	following way: it will recognise two valid read groups (one having a
	"T", the other a "G") and, in assembly mode, split these two groups
	into different contigs. It will also play safe and define that the
	single read <code class="literal">E1-1</code> will not be attributed to either
	one of the contigs but, if it cannot be assembled to other reads, form
	its own contig ... if need be, even as only a single read (a
	<span class="emphasis"><em>singlet</em></span>).
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	Depending on some settings, singlets may either appear in the regular
	results or end up in the debris file.
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_est_gap_splitting"></a>7.4.2. 
	Splitting transcripts into contigs based on larger gaps
      </h3></div></div></div><p>
	Gaps in alignments of transcripts are handled very cautiously by
	MIRA. The standard settings will lead to the creation of different
	contigs if three or more consecutive gaps are introduced in an
	alignment. Consider the following example:
      </p><pre class="screen">
A1-1  ..........CGA..........
A1-2  ..........*GA..........
A1-3  ..........**A..........
B2-1  ..........<span class="bold"><strong>***</strong></span>..........
B2-2  ..........<span class="bold"><strong>***</strong></span>..........
      </pre><p>
	Under normal circumstances, MIRA will use the reads
	<code class="literal">A1-1</code>, <code class="literal">A1-2</code> and
	<code class="literal">A1-3</code> to form one contig and put
	<code class="literal">B2-1</code> and <code class="literal">B2-2</code> into a separate
	contig. MIRA would do this also if there were only one of the B2
	reads.
      </p><p>
	The reason behind this is that the probability for having gaps of
	three or more bases only due to sequencing errors is pretty
	low. MIRA will therefore treat reads with such attributes as coming
	from different transcripts and not assemble them together, though
	this can be changed using the [-AL:egp:egpl] parameters of
	MIRA if wanted.
      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning: 
	  Problems with homopolymers, especially in 454 sequencing
	"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">
	  Problems with homopolymers, especially in 454 sequencing
	</th></tr><tr><td align="left" valign="top"><p>
	  As 454 sequencing has a general problem with homopolymers, this rule
	  of MIRA will sometimes lead to the formation of more contigs than expected
	  due to sequencing errors at "long" homopolymer sites ... where long
	  starts at ~7 bases. Though MIRA does know about the problem in 454
	  homopolymers and has some routines which try to mitigate the
	  problem, this is not always successful.
	</p></td></tr></table></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect1_est_mira_and_mirasearchestsnps"></a>7.5. 
      mira and miraSearchESTSNPs
    </h2></div></div></div><p>
    </p><p>
      The assembly of ESTs can be done in two ways when using the MIRA 4 system: by
      using mira or miraSearchESTSNPs.
    </p><p>
      If one has data from only one strain, mira using the "--job=est"
      quickmode switch is probably the way to go as it's easier to handle.
    </p><p>
      For data from multiple strains where one wants to search SNPs,
      miraSearchESTSNPs is the tool of choice. It's an automated pipeline
      that is able to assemble transcripts cleanly according to given organism
      strains. Afterwards, an integrated SNP analysis highlights the exact nature of
      mutations within the transcripts of different strains.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_using_mira_for_est_assembly"></a>7.5.1. 
	Using mira for EST assembly
      </h3></div></div></div><p>
	Using mira in EST projects is quite useful to get a first impression of
	a given data set or when used in projects that have no strain or only one
	strain.
      </p><p>
	It is recommended to use 'est' in the [-job=] quick switch to get a
	good set of initial default settings and then, if needed, adapt them with your own settings.
      </p><p>
	Note that by their nature, single transcripts end up in the debris file as
	they do not match any other reads and therefore cannot be aligned.
      </p><p>
	An interesting approach to find differences in multiploid genes is to use the
	result of a "mira --job=est ..." assembly as input for the third step of the
	miraSearchESTSNPs pipeline.
	
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_using_mira_for_est_clustering"></a>7.5.2. 
	Using mira for EST clustering
      </h3></div></div></div><p>
	Like for EST assembly, it is recommended to use 'est' in the
	[-job=] quick switch to get a good initial settings
	default. Then however, one should adapt a couple of switches to get a
	clustering-like alignment:
      </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	    <code class="filename">-AL:egp=no</code>
	  </span></dt><dd><p>
	      switching off extra gap penalty in alignments allows assembly of
	      transcripts having gap differences of more than 3 bases
	    </p></dd><dt><span class="term">
	    <code class="filename">-AL:egpl=...</code>
	  </span></dt><dd><p>
	      In case [-AL:egp] is not switched off, the extra gap
	      penalty level can be fine tuned here.
	    </p></dd><dt><span class="term">
	    <code class="filename">-AL:megpp=...</code>
	  </span></dt><dd><p>
	      In case [-AL:egp] is not switched off, the maximum
	      extra gap penalty in percentage can be fine tuned here. This
	      allows, together with  [-AL:egpl] (see above), to have
	      MIRA accept alignments which are two or three bases longer than
	      the 3 bases rejection criterion of the standard
	       [-AL:egpl=split_on_codongaps] in EST assemblies.
	    </p></dd><dt><span class="term">
	    <code class="filename">-CO:asir=yes</code>
	  </span></dt><dd><p>
	      This forces MIRA to assume that valid base differences (occurring
	      in several reads) in alignments are SNPs and not repeats/marker
	      bases for different variants. Note that depending on whether you
	      have only one or several strains in your assembly, you might
	      want to enable or disable this feature to allow/disallow
	      clustering of reads from different strains.
	    </p></dd><dt><span class="term">
	    <code class="filename">-CO:mrpg:mnq:mgqrt</code>
	  </span></dt><dd><p>
	      With these three parameters you can adjust the sensitivity of
	      the repeat / SNP discovery algorithm.
	    </p></dd><dt><span class="term">
	    <code class="filename">-AL:mrs=...</code>
	  </span></dt><dd><p>
	      When [-CO:asir=no] and  [-AL:egp=no], MIRA has
	      lost two of its most potent tools to not align complete
	      nonsense. In those cases, you should increase the minimum
	      relative score allowed in Smith-Waterman alignments to levels
	      which are higher than the usual MIRA standards. 90 or 95 might
	      be a good start for testing.
	    </p></dd><dt><span class="term">
	    <code class="filename">-CO:rodirs=...</code>
	  </span></dt><dd><p>
	      Like [-AL:mrs] above,  [-CO:rodirs] is a fall-back
	      mechanism to disallow building of completely nonsensical contigs
	      when  [-CO:asir=no] and  [-AL:egp=no]. You should decrease  [-CO:rodirs] to anywhere between 10 and 0.
	    </p></dd></dl></div><p>
	Please look up the complete description of the above mentioned
	parameters in the MIRA reference manual, they're listed here just with
	the <span class="emphasis"><em>why</em></span> one should change them for a clustering
	assembly.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      Remember that some of the parameters above can be set independently for
      reads of different sequencing technologies. E.g., when assembling EST
      sequences from <span class="emphasis"><em>Sanger</em></span> and <span class="emphasis"><em>454</em></span>
      sequencing technologies, it is absolutely possible to allow the 454
      sequences to have large gaps in alignments (to circumvent the
      homopolymer problem), but to disallow Sanger sequences from having
      them. The parameters would need to be set like this:
      <pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>mira [...] --job=est,... [...]
  SANGER_SETTINGS -AL:egp=yes:egpl=split_on_codongaps
  454_SETTINGS -AL:egp=no</code></strong></pre>
      or in shorter form (as <code class="literal">--job=est</code> already presets
      <code class="literal">-AL:egp=yes:egpl=split_on_codongaps</code> for all
      technologies):
      <pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>mira [...] --job=est,... [...]
  454_SETTINGS -AL:egp=no</code></strong>
      </pre></td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_using_mirasearchestsnps_for_est_assembly"></a>7.5.3. 
	Using miraSearchESTSNPs for EST assembly
      </h3></div></div></div><p>
	miraSearchESTSNPs is a pipeline that reconstructs the pristine mRNA
	transcript sequences gathered in EST sequencing projects of more than
	one strain, which can be a reliable basis for subsequent analysis
	steps like clustering or exon analysis.  This means that even genes
	that contain only one transcribed SNP on different alleles are first
	treated as different transcripts. The optional last step of the
	assembly process can be configured as a simple clusterer that can
	assemble transcripts containing the same exon sequence -- but only
	differ in SNP positions -- into one consensus sequence. Such SNPs can
	then be analysed, classified and reliably assigned to their
	corresponding mRNA transcriptome sequence. However, it is important to
	note that miraSearchESTSNPs is an assembler and not a full blown
	clustering tool.
      </p><p>
	Generally speaking, miraSearchESTSNPs is a three-stage assembly system
	that was designed to catch SNPs in different strains and reconstruct
	the mRNA present in those strains. That is, one really should have
	different strains to analyse (and the information provided to the
	assembler) to make the most out of miraSearchESTSNPs. Here is a quick
	overview on what miraSearchESTSNPs does:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    Step 1: assemble everything together, not caring about strain
	    information.  Potential SNPs are not treated as SNPs, but as
	    possible repeat marker bases and are tagged as such (temporarily)
	    to catch each and every possible sequence alignment which might be
	    important later. As a result of this stage, the following
	    information is written out:
	    </p><div class="orderedlist"><ol class="orderedlist" type="a"><li class="listitem"><p>
		  Into <code class="filename">step1_snpsinSTRAIN_&lt;strain_name&gt;.caf</code>
		  all the sequences of a given strain that are in contigs (can
		  be aligned with at least one other sequence) - also, all
		  sequences that are singlets BUT have been tagged previously
		  as containing tagged bases showing that they aligned
		  previously (even to other strains) but were torn apart due
		  to the SNP bases.
		</p></li><li class="listitem"><p>
		  Into <code class="filename">step1_nosnps_remain.caf</code> all the
		  remaining singlets.
		</p></li></ol></div><p>
	    Obviously, if one did not provide strain information to the
	    assembly of step 1, all the sequences belong to the same strain
	    (named <span class="emphasis"><em>"default"</em></span>). The CAF files generated in
	    this step are the input sequences for the next step.
	  </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	    If you want to apply clippings to your data (poly-A/T or reading
	    clipping information from SSAHA2 or SMALT), then do this only in
	    step 1! Do not try to re-apply them in step 2 or 3 (or only if
	    you think you have very good reasons to do so). Once loaded and/or
	    applied in step 1, the clipping information is carried on by MIRA
	    to steps 2 and 3.
	  </td></tr></table></div></li><li class="listitem"><p>
	    Step 2: Now, miraSearchESTSNPs assembles each strain independently
	    from each other.  Again, sequences containing SNPs are torn apart
	    into different contigs (or singlets) to give a clean
	    representation of the "really sequenced" ESTs. In the end, each of
	    the contigs (or singlets) coming out of the assemblies for the
	    strains is a representation of the mRNA that was floating around
	    the given cell/strain/organism. The results of this step are
	    written out into one big file
	    (<code class="filename">step2_reads.caf</code>) and a new straindata file
	    that goes along with those results
	    (<code class="filename">step2_straindata.txt</code>).
	  </p></li><li class="listitem"><p>
	    Step 3: miraSearchESTSNPs takes the result of the previous step
	    (which should now be clean transcripts) and assembles them
	    together, <span class="emphasis"><em>this time</em></span> allowing transcripts from
	    different strains with different SNP bases to be assembled
	    together. The result is then written to
	    <code class="filename">step3_out.*</code> files and directories.
	  </p></li></ol></div><p>
      </p><p>
	miraSearchESTSNPs can also be used for EST data of a single strain or
	when no strain information is available. In this case, it will cleanly
	sort out transcripts of almost identical genes or, when eukaryotic
	ESTs are assembled, according to their respective allele when these
	contain mutations.
      </p><p>
	Like the normal mira, miraSearchESTSNPs keeps track of a lot of things
	and writes out quite a lot of additional information files after each
	step.  Results and additional information of step 1 are stored in
	<code class="filename">step1_*</code> directories. Results and information of
	step 2 are in <code class="filename">&lt;strain_name&gt;_*</code>
	directories. For step 3, it's <code class="filename">step3_*</code> again.
      </p><p>
	Each step of miraSearchESTSNPs can be configured exactly like mira via
	command line parameters.
      </p><p>
	The pipeline of miraSearchESTSNPs is almost as flexible as mira
	itself: if the defaults set by the quick switches are not right for
	your use case, you can change about any parameter you wish via the
	command line. There are only two things which you need to pay
	attention to:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    a straindata file must be present for step 1
	    (<code class="filename">*_straindata_in.txt</code>), but it can very well
	    be an empty file.
	  </p></li><li class="listitem"><p>
	    the naming of the result files is fixed (for all three steps), you
	    cannot change it.
	  </p></li></ol></div><p>
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="idp42648112"></a>7.6. 
      Solving common problems of EST assemblies
    </h2></div></div></div><p>
      ... continue here ...
    </p><p>
      Megahubs =&gt; track down reason (high expr, seqvec or adaptor: see
      mira_hard) and eliminate it
    </p></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_specialparams"></a>Chapter 8. Parameters for special situations</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_sp_introduction">8.1. 
      Introduction
    </a></span></dt><dt><span class="sect1"><a href="#sect_sp_pacbio">8.2. 
      PacBio
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_sp_pacbio_ccs">8.2.1. 
	PacBio CCS reads
      </a></span></dt><dt><span class="sect2"><a href="#sect_sp_pacbio_ec">8.2.2. 
	PacBio error corrected reads
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">... .
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_sp_introduction"></a>8.1. 
      Introduction
    </h2></div></div></div><p>
      Most of this chapter and many sections are just stubs at the moment.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_sp_pacbio"></a>8.2. 
      PacBio
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_sp_pacbio_ccs"></a>8.2.1. 
	PacBio CCS reads
      </h3></div></div></div><p>
	Declare the sequencing technology to be high-quality PacBio (<span class="bold"><strong>PCBIOHQ</strong></span>). The last time I worked with CCS, the
	ends of the reads were not really clean, so using the proposed end
	clipping (which needs to be manually switched on for PCBIOHQ reads)
	may be advisable.
      </p><pre class="screen"><strong class="userinput"><code>...
parameters = PCBIOHQ_SETTINGS -CL:pec=yes
...

readgroup
technology=pcbiohq
data=...
...</code></strong></pre></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_sp_pacbio_ec"></a>8.2.2. 
	PacBio error corrected reads
      </h3></div></div></div><p>
	Declare the sequencing technology to be high-quality PacBio (<span class="bold"><strong>PCBIOHQ</strong></span>). For self-corrected data or data
	corrected with other sequencing technologies, it is recommended to
	change the  [-CO:mrpg] setting to a value which is 1/4th to
	1/5th of the average coverage of the corrected PacBio reads across the
	genome. E.g.:
      </p><pre class="screen"><strong class="userinput"><code>...
parameters = PCBIOHQ_SETTINGS -CO:mrpg=5
...

readgroup
technology=pcbiohq
data=...
...</code></strong></pre><p>
	for a project which has ~24x coverage. This necessity may change in
	later versions of MIRA though.
      </p></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_results"></a>Chapter 9. Working with the results of MIRA</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_res_looking_at_results">9.1. 
      MIRA output directories and files
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_res_resultsdir">9.1.1. 
	The <code class="filename">*_d_results</code> directory
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_res_resultsdir_denovo">9.1.1.1. 
	  Additional 'large contigs' result files for de-novo assemblies of genomes
	</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_res_infodir">9.1.2. 
	The <code class="filename">*_d_info</code> directory
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_res_first_look:the_assembly_info">9.2. 
      First look: the assembly info
    </a></span></dt><dt><span class="sect1"><a href="#sect_res_converting_results">9.3. 
      Converting results
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_res_converting_miraconvert">9.3.1. 
	Converting to and from other formats:<span class="command"><strong>miraconvert</strong></span>
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_converting_reach_other_programs">9.3.2. 
	Steps for converting data from / to other tools
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_res_converting_to_from_staden">9.3.2.1. 
	  Example: converting to and from the Staden package (gap4 / gap5)
	</a></span></dt><dt><span class="sect3"><a href="#sect_res_converting_to_from_sam">9.3.2.2. 
	  Example: converting to and from SAM (for samtools, tablet etc.)
	</a></span></dt></dl></dd></dl></dd><dt><span class="sect1"><a href="#sect_res_filtering_of_results">9.4. 
      Filtering results
    </a></span></dt><dt><span class="sect1"><a href="#sect_res_places_of_importance_in_a_de_novo_assembly">9.5. 
      Places of importance in a de-novo assembly
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_res_tags_set_by_mira">9.5.1. 
	Tags set by MIRA
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_other_places_of_importance">9.5.2. 
	Other places of importance
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_joining_contigs">9.5.3. 
	Joining contigs
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_res_joining_truerepeats">9.5.3.1. 
	  Joining contigs at true repetitive sites
	</a></span></dt><dt><span class="sect3"><a href="#sect_res_joining_FALSErepeats">9.5.3.2. 
	  Joining contigs at "wrongly discovered" repetitive sites
	</a></span></dt></dl></dd></dl></dd><dt><span class="sect1"><a href="#sect_res_places_of_interest_in_a_mapping_assembly">9.6. 
      Places of interest in a mapping assembly
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_res_poi_where_are_snps?">9.6.1. 
	Where are SNPs?
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_poi_where_are_insertions_deletions_or_genome_rearrangements?">9.6.2. 
	Where are insertions, deletions or genome re-arrangements?
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_poi_other_tags_of_interest">9.6.3. 
	Other tags of interest
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_res_postprocessing_mapping_assemblies">9.7. 
      Post-processing mapping assemblies
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_res_pp_manual_cleanup">9.7.1. 
	Manual cleanup and validation (optional)
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_poi_comprehensive_snp_analysis_spreadsheet_tables_for_excel_or_oocalc">9.7.2. 
	Comprehensive SNP analysis spreadsheet tables (for Excel or OOcalc)
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_poi_html_files_depicting_snp_positions_and_deletions">9.7.3. 
	HTML files depicting SNP positions and deletions
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_poi_wig_files">9.7.4. 
	WIG files depicting contig coverage or GC content
      </a></span></dt><dt><span class="sect2"><a href="#sect_res_poi_tables_for_feature_coverage">9.7.5. 
	Comprehensive spreadsheet tables for gene expression values / genome deletions &amp; duplications
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">You have to know what you're looking for before you can find it.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><p>
    MIRA makes results available in quite a number of formats: CAF, ACE, FASTA and
    a few others. The preferred formats are CAF and MAF, as these formats can be
    translated into any other supported format.
  </p><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_res_looking_at_results"></a>9.1. 
      MIRA output directories and files
    </h2></div></div></div><p>
      For the assembly MIRA creates a directory named
      <code class="filename"><em class="replaceable"><code>projectname</code></em>_assembly</code> in
      which a number of sub-directories will have appeared.
    </p><p>
      These sub-directories (and files within) contain the results of the
      assembly itself, general information and statistics on the results and
      -- if not deleted automatically by MIRA -- a tmp directory with log
      files and temporary data:
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	  <code class="filename"><em class="replaceable"><code>projectname</code></em>_d_results</code>:
	  this directory contains all the output files of the assembly in
	  different formats.
	</p></li><li class="listitem"><p>
	  <code class="filename"><em class="replaceable"><code>projectname</code></em>_d_info</code>:
	  this directory contains information files of the final
	  assembly. They provide statistics as well as, e.g., information
	  (easily parsable by scripts) on which read is found in which
	  contig etc.
	</p></li><li class="listitem"><p>
	  <code class="filename"><em class="replaceable"><code>projectname</code></em>_d_tmp</code>:
	  this directory contains log files and temporary assembly files. It
	  can be safely removed after an assembly as there may be easily a
	  few GB of data in there that are normally not needed anymore.
	</p><p>
	  The default settings of MIRA are such that really big files are
	  automatically deleted when they are not needed anymore during an
	  assembly.
	</p></li><li class="listitem"><p>
	  <code class="filename"><em class="replaceable"><code>projectname</code></em>_d_chkpt</code>:
	  this directory contains checkpoint files needed to resume
	  assemblies that crashed or were stopped.
	</p></li></ul></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_resultsdir"></a>9.1.1. 
	The <code class="filename">*_d_results</code> directory
      </h3></div></div></div><p>
	The following files in
	<code class="filename"><em class="replaceable"><code>projectname</code></em>_d_results</code>
	contain results of the assembly in different formats. Depending on the
	output options you defined for MIRA, some files may or may not be
	there. As long as the CAF or MAF file is present, you can translate
	your assembly later on to about any supported format with the
	<span class="command"><strong>miraconvert</strong></span> program supplied with the MIRA
	distribution:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.txt</code>:
	    this file contains in a human readable format the aligned assembly
	    results, where all input sequences are shown in the context of the
	    contig they were assembled into. This file is just meant as a
	    quick way for people to have a look at their assembly without
	    specialised alignment finishing tools.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.padded.fasta</code>:
	    this file contains as FASTA sequence the consensus of the contigs
	    that were assembled in the process.  Positions in the consensus
	    containing gaps (also called 'pads', denoted by an asterisk) are
	    still present. The computed consensus qualities are in the
	    corresponding
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.padded.fasta.qual</code>
	    file.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.unpadded.fasta</code>:
	    as above, this file contains as FASTA sequence the consensus of
	    the contigs that were assembled in the process, but positions in
	    the consensus containing gaps were removed. The computed consensus
	    qualities are in the corresponding
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.unpadded.fasta.qual</code>
	    file.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.caf</code>:
	    this is the result of the assembly in CAF format, which can be
	    further worked on with, e.g., tools from the
	    <span class="emphasis"><em>caftools</em></span> package from the Sanger Centre and
	    later on be imported into, e.g., the Staden gap4 assembly and
	    finishing tool.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.ace</code>:
	    this is the result of the assembly in ACE format. This format can
	    be read by viewers like the TIGR clview or by consed from the
	    phred/phrap/consed package.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_out.gap4da</code>:
	    this directory contains the result of the assembly suited for the
	    <span class="emphasis"><em>direct assembly</em></span> import of the Staden gap4
	    assembly viewer and finishing tool.
	  </p></li></ul></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_res_resultsdir_denovo"></a>9.1.1.1. 
	  Additional 'large contigs' result files for de-novo assemblies of genomes
	</h4></div></div></div><p>
	  For de-novo assemblies of genomes, MIRA makes a proposal regarding
	  which contigs you probably want to have a look at ... and which ones
	  you can probably forget about.
	</p><p>
	  This proposal relies on the <span class="emphasis"><em>largecontigs</em></span> file
	  in the info directory (see section below) and MIRA automatically
	  extracted these contigs into all the formats you wanted to have your
	  results in.
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      The result files for 'large contigs' are all named:
	      <code class="filename"><em class="replaceable"><code>projectname</code></em>_<span class="emphasis"><em>LargeContigs</em></span>_out.<em class="replaceable"><code>resulttype</code></em></code>:
	    </p></li><li class="listitem"><p>
	      <code class="filename">extractLargeContigs.sh</code>: this is a small
	      shell script which just contains the call
	      to <span class="command"><strong>miraconvert</strong></span> with which MIRA extracted the
	      large contigs for you. In case you want to redefine what large
	      contigs are for you, feel free to use this as template.
	    </p></li></ul></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_infodir"></a>9.1.2. 
	The <code class="filename">*_d_info</code> directory
      </h3></div></div></div><p>
	The following files in
	<code class="filename"><em class="replaceable"><code>projectname</code></em>_d_info</code>
	contain statistics and other information files of the assembly:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_assembly.txt</code>:
	    This file should be your first stop after an assembly. It will
	    tell you some statistics as well as whether or not problematic
	    areas remain in the result.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_callparameters.txt</code>:
	    This file contains the parameters as given on the mira command
	    line when the assembly was started.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_consensustaglist.txt</code>:
	    This file contains information about the tags (and their position)
	    that are present in the consensus of a contig.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_contigreadlist.txt</code>:
	    This file contains information on which reads have been assembled
	    into which contigs (or singlets).
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_contigstats.txt</code>:
	    This file contains in tabular format statistics about the contigs
	    themselves, their length, average consensus quality, number of
	    reads, maximum and average coverage, average read length, number
	    of A, C, G, T, N, X and gaps in consensus.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_debrislist.txt</code>:
	    This file contains the names of all the reads which were not
	    assembled into contigs (or singlets if appropriate MIRA parameters
	    were chosen). The file has two columns: first column is the name
	    of the read, second column is a code showing the reason and stage
	    at which the read was put into the debris category.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_largecontigs.txt</code>:
	    This file contains as a simple list the names of all the contigs
	    MIRA thinks to be more or less important at the end of the
	    assembly. To be present in this list, a contig needed to reach a
	    certain length (usually 500, but see  [-MI:lcs]) and had a
	    coverage of at least 1/3 of the average coverage (per sequencing
	    technology) of the complete project.
	  </p><p>
	    Note: only present for de-novo assemblies of genomes.
	  </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	    The default heuristics (500bp length and 1/3 coverage per
	    sequencing technology) generally work well enough for most
	    projects. However, projects with extremely different coverage
	    numbers per sequencing technology may need to use different
	    numbers. E.g.: a project with 80x Illumina and 6x Sanger would
	    have contigs consisting only of 2 or 3 Sanger sequences but with
	    the average coverage &gt;= 2 also in this list although clearly no
	    one would look at these under normal circumstances.
	  </td></tr></table></div></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_groups.txt</code>:
	    This file contains information about readgroups as determined by
	    MIRA. Most interesting will probably be statistics concerning
	    read-pair sizes.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_readrepeats</code>:
	    This file helps to find out which parts of which reads are quite
	    repetitive in a project. Please consult the chapter on how to
	    tackle "hard" sequencing projects to learn how this file can help
	    you in spotting sequencing mistakes and / or difficult parts in a
	    genome or EST / RNASeq project.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_readstooshort</code>:
	    A list containing the names of those reads that have been sorted
	    out of the assembly only due to the fact that they were too short,
	    before any processing started.
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_info_readtaglist.txt</code>:
	    This file contains information about the tags and their position
	    that are present in each read.  The read positions are given
	    relative to the forward direction of the sequence (i.e. as it was
	    entered into the assembly).
	  </p></li><li class="listitem"><p>
	    <code class="filename"><em class="replaceable"><code>projectname</code></em>_error_reads_invalid</code>:
	    A list of sequences that have been found to be invalid due to
	    various reasons (given in the output of the assembler).
	  </p></li></ul></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_res_first_look:the_assembly_info"></a>9.2. 
      First look: the assembly info
    </h2></div></div></div><p>
      Once finished, have a look at the file
      <code class="filename">*_info_assembly.txt</code> in the info directory. The
	assembly information given there is split in three major parts:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  some general assembly information (number of reads assembled etc.). This
	    part is quite short at the moment, will be expanded in future
	</p></li><li class="listitem"><p>
	  assembly metrics for 'large' contigs.
	</p></li><li class="listitem"><p>
	  assembly metrics for all contigs.
	  </p></li></ol></div><p>
      The part for large contigs contains several sections. The first of
      these shows what MIRA counts as a large contig for this particular
      project. As an example, this may look like this:
    </p><pre class="screen">
Large contigs:
--------------
With    Contig size             &gt;= 500
        AND (Total avg. Cov     &gt;= 19
             OR Cov(san)        &gt;= 0
             OR Cov(454)        &gt;= 8
             OR Cov(pbs)        &gt;= 0
             OR Cov(sxa)        &gt;= 11
             OR Cov(sid)        &gt;= 0
            )</pre><p>
      The above is for a 454 and Solexa hybrid assembly in which MIRA
      determined large contigs to be contigs
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  of length of at least 500 bp and
	</p></li><li class="listitem"><p>
	  having a total average coverage of at least 19x or an
	  average 454 coverage of 8 or an average Solexa coverage of 11
	</p></li></ol></div><p>
      The second section is about length assessment of large contigs:
    </p><pre class="screen">
  Length assessment:
  ------------------
  Number of contigs:    44
  Total consensus:      3567224
  Largest contig:       404449
  N50 contig size:      186785
  N90 contig size:      55780
  N95 contig size:      34578</pre><p>
      In the above example, 44 contigs totalling 3.57 megabases were built,
      the largest contig being 404 kilobases long and the N50/N90 and N95
      numbers give the respective lengths.
    </p><p>
      The next section shows information about the coverage assessment of
      large contigs. An example:
    </p><pre class="screen">
  Coverage assessment:
  --------------------
  Max coverage (total): 563
  Max coverage
        Sanger: 0
        454:    271
        PacBio: 0
        Solexa: 360
        Solid:  0
  Avg. total coverage (size &gt;= 5000): 57.38
  Avg. coverage (contig size &gt;= 5000)
        Sanger: 0.00
        454:    25.10
        PacBio: 0.00
        Solexa: 32.88
        Solid:  0.00</pre><p>
      Maximum coverage attained was 563, maximum for 454 alone 271 and for
      Solexa alone 360. The average total coverage (computed from contigs with
      a size &#8805; 5000 bases) is 57.38. The average coverage by sequencing
      technology (in contigs &#8805; 5000) is 25.10 for 454 and 32.88 for Solexa
      reads.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	For genome assemblies, the value for <span class="emphasis"><em>Avg. total coverage
	(size &gt;= 5000)</em></span> is currently always calculated for contigs
	having 5000 or more consensus bases. While this gives a very effective
	measure for genome assemblies, assemblies of EST or RNASeq will often
	have totally irrelevant values here: even if the default of MIRA is to
	use smaller contig sizes (1000) for EST / RNASeq assemblies, the
	coverage values for lowly and highly expressed genes can easily span a
	factor of 10000 or more.
      </p></td></tr></table></div><p>
      The last section contains some numbers useful for quality assessment. It
      looks like this:
    </p><pre class="screen">
  Quality assessment:
  -------------------
  Average consensus quality:                    90
  Consensus bases with IUPAC:                   11      (you might want to check these)
  Strong unresolved repeat positions (SRMc):    0       (excellent)
  Weak unresolved repeat positions (WRMc):      19      (you might want to check these)
  Sequencing Type Mismatch Unsolved (STMU):     0       (excellent)
  Contigs having only reads wo qual:            0       (excellent)
  Contigs with reads wo qual values:            0       (excellent)</pre><p>
      Besides the average quality of the contigs and whether they contain reads
      without quality values, MIRA shows the number of different tags in the
      consensus which might point at problems.
    </p><p>
      The above mentioned sections (length assessment, coverage assessment and
      quality assessment) for <span class="emphasis"><em>large</em></span> contigs will then be
      re-iterated for <span class="emphasis"><em>all</em></span> contigs, this time including
      also contigs which MIRA did not take into account as large contig.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_res_converting_results"></a>9.3. 
      Converting results
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_converting_miraconvert"></a>9.3.1. 
	Converting to and from other formats: <span class="command"><strong>miraconvert</strong></span>
      </h3></div></div></div><p>
	<span class="command"><strong>miraconvert</strong></span> is a tool in the MIRA package which
	reads and writes a number of formats, ranging from full assembly
	formats like CAF and MAF to simple output view formats like HTML or
	plain text.
      </p><div class="figure"><a name="chap_res::results_miraconvert.png"></a><p class="title"><b>Figure 9.1. <span class="command">miraconvert</span> supports a wide range of
	format conversions to simplify export / import of results to and from
	other programs</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/results_miraconvert.png" width="100%" alt="miraconvert supports a wide range of format conversions to simplify export / import of results to and from other programs"></td></tr></table></div></div></div><br class="figure-break"></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_converting_reach_other_programs"></a>9.3.2. 
	Steps for converting data from / to other tools
      </h3></div></div></div><p>
	The question "How Do I convert to / from other tools?" is complicated
	by the plethora of file formats and tools available. This section
	gives an overview on what is needed to reach the most important ones.
      </p><div class="figure"><a name="chap_res::results_mira2other.png"></a><p class="title"><b>Figure 9.2. 
	  Conversion steps, formats and programs needed to reach some tools
	  like assembly viewers, editors or scaffolders.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/results_mira2other.png" width="100%" alt="Conversion steps, formats and programs needed to reach some tools like assembly viewers, editors or scaffolders."></td></tr></table></div></div></div><br class="figure-break"><p>
	Please also read the chapter on MIRA utilities in this manual to learn
	more on <span class="command"><strong>miraconvert</strong></span> and have a look at
	<code class="literal">miraconvert -h</code> which lists all possible formats
	and other command line options.
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_res_converting_to_from_staden"></a>9.3.2.1. 
	  Example: converting to and from the Staden package (gap4 / gap5)
	</h4></div></div></div><p>
	  The <span class="command"><strong>gap4</strong></span> program (and its
	  successor <span class="command"><strong>gap5</strong></span>) from the Staden package are pretty
	  useful finishing tools and assembly viewers. They have their own
	  database format which MIRA does not read or write, but there are
	  interconversion possibilities using the CAF format (for gap4) and
	  SAM format (for gap5).
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      [gap4]
	    </span></dt><dd><p>
		You need the <span class="command"><strong>caf2gap</strong></span>
		and <span class="command"><strong>gap2caf</strong></span> utilities for this, which are
		distributed separately from the Sanger Centre
		(<a class="ulink" href="http://www.sanger.ac.uk/Software/formats/CAF/" target="_top">http://www.sanger.ac.uk/Software/formats/CAF/</a>).
		Conversion is pretty straightforward. From MIRA to gap4, it's
		like this:
	      </p><pre class="screen">
<code class="prompt">$</code> caf2gap -project <em class="replaceable"><code>YOURGAP4PROJECTNAME</code></em> -ace <em class="replaceable"><code>mira_result.caf</code></em> &gt;&amp;/dev/null</pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Don't be fooled by the <code class="literal">-ace</code> parameter of
		<span class="command"><strong>caf2gap</strong></span>. It needs a CAF file as input, not
		an ACE file.
	      </td></tr></table></div><p>
		From gap4 to CAF, it's like this:
	      </p><pre class="screen">
<code class="prompt">$</code> gap2caf -project <em class="replaceable"><code>YOURGAP4PROJECTNAME</code></em> &gt;tmp.caf
<code class="prompt">$</code> miraconvert -r c tmp.caf <em class="replaceable"><code>somenewname</code></em>.caf</pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Using <span class="command"><strong>gap2caf</strong></span>, be careful to use the simple
		<code class="literal">&gt;</code> redirection to file and
		<span class="emphasis"><em>not</em></span> the <code class="literal">&gt;&amp;</code>
		redirection.
	      </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Using first <span class="command"><strong>gap2caf</strong></span> and then
		<span class="command"><strong>miraconvert</strong></span> is needed as gap4 writes its
		own consensus to the CAF file which is not necessarily the
		best. Indeed, gap4 does not know about different sequencing
		technologies like 454 and treats everything as
		Sanger. Therefore, using
		<span class="command"><strong>miraconvert</strong></span> with the  [-r c] option
		recalculates a MIRA consensus during the "conversion" from CAF to CAF.
	      </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		If you work with a 32 bit executable of caf2gap, it might very
		well be that the converter needs more memory than can be
		handled by 32 bit. Only solution: switch to a 64 bit
		executable of caf2gap.
	      </td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning: 
		  caf2gap bug for sequence annotations in reverse direction
		"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">
		  caf2gap bug for sequence annotations in reverse direction
		</th></tr><tr><td align="left" valign="top"><p>
		  caf2gap has currently (as of version 2.0.2) a bug that turns
		  around all features in reverse direction during the
		  conversion from CAF to a gap4 project. There is a fix
		  available, please contact me for further information (until
		  I find time to describe it here).
		</p></td></tr></table></div></dd><dt><span class="term">
	      [gap5]
	    </span></dt><dd><p>
		The <span class="command"><strong>gap5</strong></span> program is the successor for
		gap4. It comes with its own import utility
		(<span class="command"><strong>tg_index</strong></span>) which can import SAM and CAF
		files, and gap5 itself has an export function which also
		writes SAM and CAF. It is suggested to use the SAM format to
		export data to gap5 as it is more efficient and conveys more
		information on sequencing technologies used.
	      </p><p>
		Conversion is pretty straightforward. From MIRA to gap5, it's like
		this:
	      </p><pre class="screen">
<code class="prompt">$</code> tg_index <em class="replaceable"><code>INPUT</code></em>_out.sam</pre><p>
		This creates a gap5 database named
		<code class="filename"><em class="replaceable"><code>INPUT</code></em>_out.g5d</code>
		which can be directly loaded with gap5 like this:
	      </p><pre class="screen">
<code class="prompt">$</code> gap5 <em class="replaceable"><code>INPUT</code></em>_out.g5d</pre><p>
		Exporting back to SAM or CAF is done in gap5 via
		the <span class="emphasis"><em>File-&gt;Export Sequences</em></span> menu there.
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_res_converting_to_from_sam"></a>9.3.2.2. 
	  Example: converting to and from SAM (for samtools, tablet etc.)
	</h4></div></div></div><p>
	  Converting to SAM is done by
	  using <span class="command"><strong>miraconvert</strong></span> on a MIRA MAF file, like this:
	</p><pre class="screen">
<code class="prompt">$</code> miraconvert -f maf -t sam <em class="replaceable"><code>INPUT</code></em>.maf <em class="replaceable"><code>OUTPUT</code></em></pre><p>
	  The above will create a file named <code class="filename">OUTPUT.sam</code>.
	</p><p>
	  Converting from SAM to a format which either <span class="command"><strong>mira</strong></span>
	  or <span class="command"><strong>miraconvert</strong></span> can understand takes a few
	  more steps. As neither tool currently reads SAM natively, you need
	  to go via the <span class="command"><strong>gap5</strong></span> editor of the Staden package:
	  convert the SAM via <span class="command"><strong>tg_index</strong></span> to a gap5 database,
	  load that database in gap5 and export it there to CAF.
	</p></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_res_filtering_of_results"></a>9.4. 
      Filtering results
    </h2></div></div></div><p>
      It is important to remember that, depending on assembly options, MIRA
      will also include very small contigs (with possibly very low coverage)
      made out of reads which were rejected from the "good" contigs for
      quality or other reasons. You probably do not want to have a look at
      this contig debris when finishing a genome unless you are really,
      really, really picky.
    </p><p>
      Many people prefer to just go on with what would be large
      contigs. Therefore, in de-novo assemblies, MIRA writes out separate
      files of what it thinks are "good", large contigs. In case you want to
      extract contigs differently, the <span class="command"><strong>miraconvert</strong></span> program
      from the MIRA package can selectively filter CAF or MAF files for
      contigs with a certain size, average coverage or number of reads.
    </p><p>
      The file <code class="filename">*_info_assembly.txt</code> in the info directory
      at the end of an assembly might give you first hints on what could be
      suitable filter parameters. As an example, for "normal" assemblies
      (whatever this means), one could want to consider only contigs larger
      than 500 bases and which have at least one third of the average coverage
      of the N50 contigs.
    </p><p>
      Here's an example: In the "Large contigs" section, there's a "Coverage
      assessment" subsection. It looks a bit like this:
    </p><pre class="screen">
...
Coverage assessment:
--------------------
Max coverage (total): 43
Max coverage
Sanger: 0
454:    43
Solexa: 0
Solid:  0
Avg. total coverage (size &#8805; 5000): 22.30
Avg. coverage (contig size &#8805; 5000)
Sanger: 0.00
454:    22.05
Solexa: 0.00
Solid:  0.00
...</pre><p>
      This project was obviously a 454 only project, and the average coverage
      for it is ~22. This number was estimated by MIRA by taking only contigs
      of at least 5kb into account, which for sure left out everything which
      could be categorised as debris. Normally it's a pretty solid number.
    </p><p>
      Now, depending on how much time you want to invest performing some manual
      polishing, you should extract contigs which have at least the following
      fraction of the average coverage:
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	  2/3 if a quick and "good enough" result is what you want and you don't
	  want to do any manual polishing. In this example, that would be around
	  14 or 15.
	</p></li><li class="listitem"><p>
	  1/2 if you want to have a "quick look" and possibly perform some
	  contig joins. In this example the number would be 11.
	</p></li><li class="listitem"><p>
	  1/3 if you want to be quite accurate and for sure not lose any possible
	  repeat. That would be 7 or 8 in this example.
	</p></li></ul></div><p>
      Example (useful with assemblies of Sanger data): extracting only contigs &#8805;
      1000 bases and with a minimum average coverage of 4 into FASTA format:
    </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>miraconvert -x 1000 -y 4 <em class="replaceable"><code>sourcefile.maf targetfile.fasta</code></em></code></strong></pre><p>
      Example (useful with assemblies of 454 data): extracting only contigs
      &#8805; 500 bases into FASTA format:
    </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>miraconvert -x 500 <em class="replaceable"><code>sourcefile.maf targetfile.fasta</code></em></code></strong></pre><p>
      Example (e.g. useful with Sanger/454 hybrid assemblies): extracting only
      contigs &#8805; 500 bases and with an average coverage &#8805; 15 reads into
      CAF format, then converting the reduced CAF into a Staden GAP4 project:
    </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>miraconvert -x 500 -y 15 <em class="replaceable"><code>sourcefile.maf tmp.caf</code></em></code></strong>
<code class="prompt">$</code> <strong class="userinput"><code>caf2gap -project <em class="replaceable"><code>somename</code></em> -ace <em class="replaceable"><code>tmp.caf</code></em></code></strong></pre><p>
      Example (e.g. useful with Sanger/454 hybrid assemblies): extracting only
      contigs &#8805; 1000 bases and with &#8805; 10 reads from MAF into CAF format,
      then converting the reduced CAF into a Staden GAP4 project:
    </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>miraconvert -x 1000 -z 10 <em class="replaceable"><code>sourcefile.maf tmp.caf</code></em></code></strong>
<code class="prompt">$</code> <strong class="userinput"><code>caf2gap -project <em class="replaceable"><code>somename</code></em> -ace <em class="replaceable"><code>tmp.caf</code></em></code></strong></pre></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_res_places_of_importance_in_a_de_novo_assembly"></a>9.5. 
      Places of importance in a de-novo assembly
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_tags_set_by_mira"></a>9.5.1. 
	Tags set by MIRA
      </h3></div></div></div><p>
        MIRA sets a number of different tags in resulting assemblies. They can be set in reads
        (in which case they mostly end with an <span class="emphasis"><em>r</em></span>) or in the consensus (then
        ending with a <span class="emphasis"><em>c</em></span>).
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  If you use the
	  Staden <span class="command"><strong>gap4</strong></span>, <span class="command"><strong>gap5</strong></span> or
	  <span class="command"><strong>consed</strong></span> assembly editor to tidy up the assembly, you
	  can directly jump to places of interest that MIRA marked for further
	  analysis by using the search functionality of these programs.
	</p><p>
	  However, you need to tell these programs that these tags exist. For
	  that you must change some configuration files. More information on
	  how to do this can be found in the
	  <code class="filename">support/README</code> file of the MIRA distribution.
	</p></td></tr></table></div><p>
	You should search for the following "consensus" tags for finding places of importance
	(in this order).
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    IUPc
	  </p></li><li class="listitem"><p>
	    UNSc
	  </p></li><li class="listitem"><p>
	    SRMc
	  </p></li><li class="listitem"><p>
	    WRMc
	    </p></li><li class="listitem"><p>
	    STMU (only hybrid assemblies)
	  </p></li><li class="listitem"><p>
	    MCVc (only when assembling different strains, i.e., mostly relevant for mapping assemblies)
	  </p></li><li class="listitem"><p>
	    SROc (only when assembling different strains, i.e., mostly relevant for mapping assemblies)
	  </p></li><li class="listitem"><p>
	    SAOc (only when assembling different strains, i.e., mostly relevant for mapping assemblies)
	  </p></li><li class="listitem"><p>
	    SIOc (only when assembling different strains, i.e., mostly relevant for mapping assemblies)
	  </p></li><li class="listitem"><p>
	    STMS (only hybrid assemblies)
	  </p></li></ul></div><p>
      </p><p>
	Of lesser importance are the "read" versions of the tags above:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    UNSr
	  </p></li><li class="listitem"><p>
	    SRMr
	  </p></li><li class="listitem"><p>
	    WRMr
	  </p></li><li class="listitem"><p>
	    SROr (only when assembling different strains, i.e., mostly relevant for mapping assemblies)
	  </p></li><li class="listitem"><p>
	    SAOr (only when assembling different strains, i.e., mostly relevant for mapping assemblies)
	  </p></li><li class="listitem"><p>
	    SIOr (only when assembling different strains, i.e., mostly relevant for mapping assemblies)
	    </p></li></ul></div><p>
      </p><p>
	In normal assemblies (only one sequencing technology, just one
	strain), search for the IUPc, UNSc, SRMc and WRMc tags.
      </p><p>
	In hybrid assemblies, searching for the IUPc, UNSc, SRMc, WRMc, and
	STMU tags and correcting only those places will allow you to have a
	qualitatively good assembly in no time at all.
      </p><p>
	Columns with SRMr tags (SRM in <span class="bold"><strong>R</strong></span>eads)
	in an assembly without a SRMc tag at the same consensus position show
	where mira was able to resolve a repeat during the different passes of
	the assembly ... you don't need to look at these. SRMc and WRMc tags
	however mean that there may be unresolved trouble ahead, you should take a
	look at these.
      </p><p>
	Especially in mapping assemblies, columns with the MCVc, SROx, SIOx and SAOx tags are
	extremely helpful in finding places of interest. As they are only set if you
	gave strain information to MIRA, you should always do that.
      </p><p>
	For more information on tags set/used by MIRA and what they exactly mean, please look up the
	according section in the reference chapter.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_other_places_of_importance"></a>9.5.2. 
	Other places of importance
      </h3></div></div></div><p>
	The read coverage histogram as well as the template display of gap4
	will help you to spot other places of potential interest. Please consult the
	gap4 documentation.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_joining_contigs"></a>9.5.3. 
	Joining contigs
      </h3></div></div></div><p>
	I recommend investing a couple of minutes (in the best case) to a few
	hours in joining contigs, especially if the uniform read distribution
	option of MIRA was used (but first filter for large contigs). This
	way, you will reduce the number of "false repeats" and improve the
	overall quality of your assembly.
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_res_joining_truerepeats"></a>9.5.3.1. 
	  Joining contigs at true repetitive sites
	</h4></div></div></div><p>
	  Joining contigs at repetitive sites of a genome is always a
	  difficult decision. There are, however, two rules which can help:
	  </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem">
	      If the sequencing was done without a paired-end library, don't join.
	    </li><li class="listitem">
	      If the sequencing was done with a paired-end library, but no
	      pair (or template) spans the join site, don't join.
	    </li></ol></div><p>
	  </p><p>
	    The following screen shot shows a case where one should not join as
	    the finishing program (in this case <span class="command"><strong>gap4</strong></span>) warns
	    that no template (read-pair) spans the join site:
	  </p><p>
	  </p><div class="figure"><a name="haf_danger_join_notok.png"></a><p class="title"><b>Figure 9.3. 
	      Join at a repetitive site which should not be performed due to
	      missing spanning templates.
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/haf_danger_join_notok.png" width="100%" alt="Join at a repetitive site which should not be performed due to missing spanning templates."></td></tr></table></div></div></div><p><br class="figure-break">
	  </p><p>
	    The next screen shot shows a case where one should join as the
	    finishing program (in this case <span class="command"><strong>gap4</strong></span>) finds
	    templates spanning the join site and all of them are good:
	  </p><p>
	  </p><div class="figure"><a name="haf_danger_join_ok.png"></a><p class="title"><b>Figure 9.4. 
	      Join at a repetitive site which should be performed due to
	      spanning templates being good.
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/haf_danger_join_ok.png" width="100%" alt="Join at a repetitive site which should be performed due to spanning templates being good."></td></tr></table></div></div></div><p><br class="figure-break">
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_res_joining_FALSErepeats"></a>9.5.3.2. 
	  Joining contigs at "wrongly discovered" repetitive sites
	</h4></div></div></div></div><p>
	Remember that MIRA takes a very cautious approach in contig building,
	and sometimes creates two contigs when it could have created
	one. Three main reasons can be the cause for this:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    when using <span class="emphasis"><em>uniform read distribution</em></span>, some
	    non-repetitive areas may have generated so many more reads that
	    they start to look like repeats (so called pseudo-repeats). In
	    this case, reads that are above a given coverage are
	    <span class="emphasis"><em>shaved off</em></span> (see [-AS:urdcm]) and kept
	    in reserve to be used for another copy of that repeat ... which in
	    case of a non-repetitive region will of course never arrive. So at
	    the end of an assembly, these shaved-off reads will form short,
	    low coverage contig debris which can more or less be safely
	    ignored and sorted out via the filtering options ( [-x -y
	    -z]) of <span class="command"><strong>miraconvert</strong></span>.
	  </p><p>
	    Some 454 library construction protocols -- especially, but not
	    exclusively, for paired-end reads -- create pseudo-repeats quite
	    frequently. In this case, the pseudo-repeats are characterised by
	    several reads starting at exactly the same position but which can
	    have different lengths. Should MIRA have separated these reads
	    into different contigs, these can be -- most of the time -- safely
	    joined. The following figure shows such a case:
	  </p><div class="figure"><a name="454_stacks_join.png"></a><p class="title"><b>Figure 9.5. 
	      Pseudo-repeat in 454 data due to sequencing artifacts
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/454_stacks_join.png" width="100%" alt="Pseudo-repeat in 454 data due to sequencing artifacts"></td></tr></table></div></div></div><br class="figure-break"><p>
	    For Solexa data, a non-negligible GC bias has been reported in
	    genome assemblies since late 2009. In genomes with moderate to
	    high GC, this bias actually favours regions with lower
	    GC. Examples were observed where regions with an average GC of 10%
	    less than the rest of the genome had between two and four times
	    more reads than the rest of the genome, leading to false
	    "discovery" of duplicated genome regions.
	  </p></li><li class="listitem"><p>
	    when using unpaired data, the above described possibility of
	    having "too many" reads in a non-repetitive region can also lead
	    to a contig being separated into two contigs in the region of the
	    pseudo-repeat.
	  </p></li><li class="listitem"><p>
	    a number of reads (sometimes even just one) can contain "high
	    quality garbage", that is, nonsense bases which got - for some
	    reason or another - good quality values.  This garbage can be
	    distributed on a long stretch in a single read or concern just a
	    single base position across several reads.
	  </p><p>
	    While MIRA has some algorithms to deal with the disrupting effects
	    of reads like these, the algorithms are not always 100% effective and
	    some reads might slip through the filters.
	  </p></li></ol></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_res_places_of_interest_in_a_mapping_assembly"></a>9.6. 
      Places of interest in a mapping assembly
    </h2></div></div></div><p>
      This section just gives a short overview on the tags you might find
      interesting. For more information, especially on how to configure gap4
      or consed, please consult the <span class="emphasis"><em>mira usage</em></span> document
      and the <span class="emphasis"><em>mira</em></span> manual.
    </p><p>
      In file types that allow tags (CAF, MAF, ACE), SNPs and other
      interesting features will be marked by MIRA with a number of tags. The
      following sections give a brief overview. For a description of what
      the tags are (SROc, WRMc etc.), please read up the section "Tags used
      in the assembly by MIRA and EdIt" in the main manual.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      Screen shots in this section are taken from the walk-through with
      Lenski data (see below).
    </td></tr></table></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_poi_where_are_snps?"></a>9.6.1. 
	Where are SNPs?
      </h3></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    the <span class="bold"><strong>SROc</strong></span> tag will point to most
	    SNPs. Should you assemble sequences of more than one strain (I
	    cannot really recommend such a strategy), you also might
	    encounter <span class="bold"><strong>SIOc</strong></span> and <span class="bold"><strong>SAOc</strong></span> tags.
	  </p><div class="figure"><a name="chap_sol::sxa_sroc_lenski1.png"></a><p class="title"><b>Figure 9.6. 
	      "SROc" tag showing a SNP position in a Solexa mapping
	      assembly.
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_sroc_lenski1.png" width="100%" alt='"SROc" tag showing a SNP position in a Solexa mapping assembly.'></td></tr></table></div></div></div><br class="figure-break"><div class="figure"><a name="chap_sol::sxa_sroc_lenski2.png"></a><p class="title"><b>Figure 9.7. 
	      "SROc" tag showing a SNP/indel position in a Solexa mapping
	      assembly.
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_sroc_lenski2.png" width="100%" alt='"SROc" tag showing a SNP/indel position in a Solexa mapping assembly.'></td></tr></table></div></div></div><br class="figure-break"></li><li class="listitem"><p>
	    the <span class="bold"><strong>WRMc</strong></span> tags might sometimes
	    point to SNPs or to indels of one or two bases.
	  </p></li></ul></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_poi_where_are_insertions_deletions_or_genome_rearrangements?"></a>9.6.2. 
	Where are insertions, deletions or genome re-arrangements?
      </h3></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    Large deletions: the <span class="bold"><strong>MCVc</strong></span> tags
	    point to deletions in the resequenced data, where no read is
	    covering the reference genome.
	  </p><div class="figure"><a name="chap_sol::sxa_mcvc_lenski.png"></a><p class="title"><b>Figure 9.8. 
	      "MCVc" tag (dark red stretch in figure) showing a genome
	      deletion in Solexa mapping assembly.
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_mcvc_lenski.png" width="100%" alt='"MCVc" tag (dark red stretch in figure) showing a genome deletion in Solexa mapping assembly.'></td></tr></table></div></div></div><br class="figure-break"></li><li class="listitem"><p>
	    Insertions, small deletions and re-arrangements: these are
	    harder to spot. In unpaired data sets they can be found looking
	    at clusters of <span class="bold"><strong>SROc</strong></span>, <span class="bold"><strong>SRMc</strong></span>, <span class="bold"><strong>WRMc</strong></span>, and / or <span class="bold"><strong>UNSc</strong></span> tags.
	  </p><div class="figure"><a name="chap_sol::sxa_wrmcsrmc_hiding_lenski1.png"></a><p class="title"><b>Figure 9.9. 
	      An IS150 insertion hiding behind a WRMc and a SRMc tag
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_wrmcsrmc_hiding_lenski1.png" width="100%" alt="An IS150 insertion hiding behind a WRMc and a SRMc tag"></td></tr></table></div></div></div><br class="figure-break"><p>
	    more massive occurrences of these tags lead to a rather colourful
	    display in finishing programs, which is why these clusters are
	    also sometimes called Xmas-trees.
	  </p><div class="figure"><a name="chap_sol::sxa_xmastree_lenski1.png"></a><p class="title"><b>Figure 9.10. 
	      A 16 base pair deletion leading to a SROc/UNSc xmas-tree
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_xmastree_lenski1.png" width="100%" alt="A 16 base pair deletion leading to a SROc/UNSc xmas-tree"></td></tr></table></div></div></div><br class="figure-break"><div class="figure"><a name="chap_sol::sxa_xmastree_lenski2.png"></a><p class="title"><b>Figure 9.11. 
	      An IS186 insertion leading to a SROc/UNSc xmas-tree
	    </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_xmastree_lenski2.png" width="100%" alt="An IS186 insertion leading to a SROc/UNSc xmas-tree"></td></tr></table></div></div></div><br class="figure-break"><p>
	    In sets with paired-end data, post-processing software (or
	    alignment viewers) can use the read-pair information to guide
	    you to these sites (MIRA doesn't set tags at the moment).
	  </p></li></ul></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_poi_other_tags_of_interest"></a>9.6.3. 
	Other tags of interest
      </h3></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    the <span class="bold"><strong>UNSc</strong></span> tag points to areas
	    where the consensus algorithm had troubles choosing a base. This
	    happens in low coverage areas, at places of insertions (compared
	    to the reference genome) or sometimes also in places where
	    repeats with a few bases difference are present. Often enough,
	    these tags are in areas with problematic sequences for the
	    Solexa sequencing technology like, e.g., a
	    <code class="literal">GGCxG</code> or even <code class="literal">GGC</code> motif in
	    the reads.
	  </p></li><li class="listitem"><p>
	    the <span class="bold"><strong>SRMc</strong></span> tag points to places
	    where repeats with a few bases difference are present. Here too,
	    sequences problematic for the Solexa technology are likely to
	    have caused base calling errors and subsequently the setting of
	    this tag.
	  </p></li></ul></div><p>
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_res_postprocessing_mapping_assemblies"></a>9.7. 
      Post-processing mapping assemblies
    </h2></div></div></div><p>
      This section is a bit terse, you should also read the chapter on
      <span class="emphasis"><em>working with results</em></span> of MIRA.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_pp_manual_cleanup"></a>9.7.1. 
	Manual cleanup and validation (optional)
      </h3></div></div></div><p>
	When working with resequencing data and a mapping assembly, I always
	load finished projects into an assembly editor and perform a quick
	cleanup of the results. SNPs or small indels normally do not need
	cleanups, but every mapper will get larger indels mostly wrong, and
	MIRA is no exception to this.
      </p><p>
	For close relatives of the reference strain this doesn't take long as
	MIRA will have set tags (see section earlier in this document) at all
	sites you should have a look at. For example, very close mutant
	bacteria with just SNPs or simple deletions and no genome
	reorganisation, I usually clean up in 10 to 15 minutes. That gives the
	last boost to data quality and your users (biologists etc.) will thank
	you for that as it reduces their work in analysing the data (be it
	looking at data or performing wet-lab experiments).
      </p><p>
        The general workflow I use is to convert the CAF file to a gap4 or gap5
        database. Then, in gap4 or gap5, I
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    quickly search for the UNSc and WRMc tags and check whether they
	    could be real SNPs that were overlooked by MIRA. In that case, I
	    manually set a SROc (or SIOc) tag in gap4 via hotkeys that were
	    defined to set these tags.
	  </p></li><li class="listitem"><p>
	    sometimes also quickly clean up reads that are causing trouble in
	    alignments and lead to wrong base calling. These can be found at
	    sites with UNSc tags, most of the time they have the 5' to 3'
	    <code class="literal">GGCxG</code> motif which can cause trouble to Solexa.
	  </p></li><li class="listitem"><p>
	    look at sites with deletions (tagged with MCVc) and look whether I
	    should clean up the borders of the deletion.
	  </p></li></ol></div><p>
	After this, I convert the gap4 or gap5 database back to CAF format.
	But beware: gap4 does not have the same consensus calling routines as
	MIRA and will have saved its own consensus in the new CAF. In fact,
	gap4 performs rather badly in projects with multiple sequencing
	technologies. So I use miraconvert from the MIRA package to recall
	a good consensus (and save it in MAF as it's more compact and a lot
	faster in handling than CAF):
      </p><p>
	And from this MAF file I can then convert with miraconvert to any
	other format I or my users need: CAF, FASTA, ACE, WIG (for coverage
	analysis), SNP and coverage analysis (see below), HTML etc.pp.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_poi_comprehensive_snp_analysis_spreadsheet_tables_for_excel_or_oocalc"></a>9.7.2. 
	Comprehensive SNP analysis spreadsheet tables (for Excel or OOcalc)
      </h3></div></div></div><p>
	Biologists are not really interested in SNP coordinates, and why
	should they? They're more interested where SNPs are, how good they
	are, which genes or other elements they hit, whether they have an
	effect on a protein sequence, whether they may be important etc. For
	organisms without intron/exon structure or splice variants, MIRA can
	generate pretty comprehensive tables and files if an annotated
	GenBank file was used as reference and strain information was given
	to MIRA during the assembly.
      </p><p>
	Well, MIRA does all that automatically for you if the reference
	sequence you gave was annotated.
      </p><p>
	For this, <span class="command"><strong>miraconvert</strong></span> should be used with the
	<span class="emphasis"><em>asnp</em></span> format as target and a MAF (or CAF) file as
	input:
      </p><pre class="screen"><code class="prompt">$</code> <strong class="userinput"><code>miraconvert -t asnp <em class="replaceable"><code>input.maf output</code></em></code></strong></pre><p>
	Note that it is strongly suggested to perform a quick manual cleanup
	of the assembly prior to this: for rare cases (mainly at sites of
	small indels of one or two bases), MIRA will not tag SNPs with a SNP
	tag (SROc, SAOc or SIOc) but will be fooled into a tag denoting
	unsure positions (UNSc). This can be quickly corrected manually. See
	further down in this manual in the section on post-processing.
      </p><p>
	After conversion, you will have four files in the directory which
	you can all drag-and-drop into spreadsheet applications like
	OpenOffice Calc or Excel.
      </p><p>
	The files should be pretty self-explanatory, here's just a short overview:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    <code class="filename">output_info_snplist.txt</code> is a simple list of
	    the SNPs, with their positions compared to the reference
	    sequence (in bases and map degrees on the genome) as well as the
	    GenBank features they hit.
	  </p></li><li class="listitem"><p>
	    <code class="filename">output_info_featureanalysis.txt</code> is a much
	    extended version of the list above. It puts the SNPs into
	    context of the features (proteins, genes, RNAs etc.) and gives a
	    nice list, SNP by SNP, what might cause bigger changes in
	    proteins.
	  </p></li><li class="listitem"><p>
	    <code class="filename">output_info_featuresummary.txt</code> looks at the
	    changes (SNPs, indels) from the other way round. It gives an
	    excellent overview which features (genes, proteins, RNAs,
	    intergenic regions) you should investigate.
	  </p><p>
	    There's one column (named 'interesting') which pretty much
	    sums up everything you need into three categories: yes,
	    no, and perhaps. 'Yes' is set if indels were detected, an amino
	    acid changed, start or stop codon changed or for SNPs in
	    intergenic regions and RNAs. 'Perhaps' is set for SNPs in
	    proteins that change a codon, but not an amino acid (silent
	    SNPs). 'No' is set if no SNP is hitting a feature.
	  </p></li><li class="listitem"><p>
	    <code class="filename">output_info_featuresequences.txt</code> simply
	    gives the sequences of each feature of the reference sequence
	    and the resequenced strain.
	  </p></li></ol></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_poi_html_files_depicting_snp_positions_and_deletions"></a>9.7.3. 
	HTML files depicting SNP positions and deletions
      </h3></div></div></div><p>
	I've come to realise that people who don't handle data from NextGen
	sequencing technologies on a regular basis (e.g., many biologists)
	don't want to be bothered with learning to handle specialised
	programs to have a look at their resequenced strains. Be it because
	they don't have time to learn how to use a new program or because
	their desktop is not strong enough (CPU, memory) to handle the data
	sets.
      </p><p>
	Something even biologists know how to operate is a browser. Therefore,
	miraconvert has the option to load a MAF (or CAF) file of a
	mapping assembly and output to HTML those areas which are interesting
	to biologists. It uses the tags SROc, SAOc, SIOc and MCVc and outputs
	the surrounding alignment of these areas together with a nice overview
	and links to jump from one position to the previous or next.
      </p><p>
	This is done with the '<code class="literal">-t hsnp</code>' option of
	miraconvert:
      </p><pre class="screen"><code class="prompt">$</code> <strong class="userinput"><code>miraconvert -t hsnp <em class="replaceable"><code>input.maf output</code></em></code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	I recommend doing this only if the resequenced strain is a very close
	relative to the reference genome, else the HTML gets pretty big. But
	for a couple of hundred SNPs it works great.
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_poi_wig_files"></a>9.7.4. 
	WIG files depicting contig coverage or GC content
      </h3></div></div></div><p>
	<span class="command"><strong>miraconvert</strong></span> can also dump a coverage file in WIG
	format (using '<code class="literal">-t wig</code>') or a WIG file for GC
	content (using '<code class="literal">-t gcwig</code>'). This comes in pretty handy
	for searching genome deletions or duplications in programs like the
	Affymetrix Integrated Genome Browser (IGB, see <a class="ulink" href="http://igb.bioviz.org/" target="_top">http://igb.bioviz.org/</a>) or when looking for foreign sequence
	in a genome.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_res_poi_tables_for_feature_coverage"></a>9.7.5. 
	Comprehensive spreadsheet tables for gene expression values / genome deletions &amp; duplications
      </h3></div></div></div><p>
	When having data mapped against a reference with annotations (either
	from GenBank formats or GFF3 formats),
	<span class="command"><strong>miraconvert</strong></span> can generate tables depicting
	either expression values (in RNASeq/EST data mappings) or probable
	genome multiplication and deletion factors (in genome mappings). For
	this to work, you must use a MAF or CAF file as input, specify
	<span class="emphasis"><em>fcov</em></span> as output format and the reference sequence
	must have had annotations during the mapping with MIRA.
      </p><p>TODO: add example</p><pre class="screen"><strong class="userinput"><code>miraconvert -t fcov <em class="replaceable"><code>mira_out.maf myfeaturetable</code></em></code></strong></pre></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_mutils"></a>Chapter 10. Utilities in the MIRA package</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_mutils_convpro">10.1. miraconvert</a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_mutils_cp_synopsis">10.1.1. 
	Synopsis
      </a></span></dt><dt><span class="sect2"><a href="#sect_mutils_cp_description">10.1.2. Description</a></span></dt><dt><span class="sect2"><a href="#sect_mutils_cp_options">10.1.3. Options</a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_mutils_cp_options_general">10.1.3.1. General options</a></span></dt><dt><span class="sect3"><a href="#sect_mutils_cp_options_contigs">10.1.3.2. Options for input containing contig data</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_mutils_cp_examples">10.1.4. Examples</a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_mutils_bait">10.2. mirabait - a "grep" for kmers</a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_mutils_bait_synopsis">10.2.1. 
	Synopsis
      </a></span></dt><dt><span class="sect2"><a href="#sect_mutils_bait_description">10.2.2. Description</a></span></dt><dt><span class="sect2"><a href="#sect_mutils_bait_options">10.2.3. Options</a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_mutils_bait_mainoptions">10.2.3.1. Main options</a></span></dt><dt><span class="sect3"><a href="#sect_mutils_bait_filetypeoptions">10.2.3.2. File type options</a></span></dt><dt><span class="sect3"><a href="#sect_mutils_bait_outputdef">10.2.3.3. Output definition</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_mutils_bait_examples">10.2.4. Usage examples</a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">Ninety percent of success is just growing up.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_mutils_convpro"></a>10.1. miraconvert</h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_mutils_cp_synopsis"></a>10.1.1. 
	Synopsis
      </h3></div></div></div><div class="cmdsynopsis"><p><code class="command">miraconvert</code>  [options] {<em class="replaceable"><code>input_file</code></em>} {<em class="replaceable"><code>output_basename</code></em>}</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_mutils_cp_description"></a>10.1.2. Description</h3></div></div></div><p>
	<span class="command"><strong>miraconvert</strong></span> is a tool to convert, extract and
	sometimes recalculate all kinds of data related to sequence assembly
	files.
      </p><p>
	More specifically, <span class="command"><strong>miraconvert</strong></span> can
	</p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	      convert from multiple alignment files (CAF, MAF) to other multiple
	      alignment files (CAF, MAF, ACE, SAM), and -- if wished -- selecting
	      contigs by different criteria like name, length, coverage etc.
	    </p></li><li class="listitem"><p>
	      extract the consensus from multiple alignments in CAF and MAF format,
	      writing it to any supported output format (FASTA, FASTQ, plain text,
	      HTML, etc.) and -- if wished -- recalculating the consensus using
	      the MIRA consensus engine with MIRA parameters
	    </p></li><li class="listitem"><p>
	      extract read sequences (clipped or unclipped) from multiple
	      alignments and save to any supported format
	    </p></li><li class="listitem"><p>
	      Much more, need to document this.
	    </p></li></ol></div><p>
      </p><p>&#8230;</p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_mutils_cp_options"></a>10.1.3. Options</h3></div></div></div><p>&#8230;</p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_mutils_cp_options_general"></a>10.1.3.1. General options</h4></div></div></div><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      <code class="option">-f
	      <em class="replaceable"><code>
		{ <code class="option">caf</code>  |   <code class="option">maf</code>  |   <code class="option">fasta</code>  |   <code class="option">fastq</code>  |   <code class="option">gbf</code>  |   <code class="option">phd</code>  |   <code class="option">fofnexp</code> }
	      </code></em>
	      </code>
	    </span></dt><dd><p>
		<span class="quote">&#8220;<span class="quote">From-type</span>&#8221;</span>, the format of the input file. CAF and MAF
		files can contain full assemblies and/or unassembled (single)
		sequences while the other formats contain only unassembled
		sequences.
	      </p></dd><dt><span class="term">
	      <code class="option">-t
	      <em class="replaceable"><code>
		{ <code class="option">ace</code>  |   <code class="option">asnp</code>  |   <code class="option">caf</code>  |   <code class="option">crlist</code>  |   <code class="option">cstats</code>  |   <code class="option">exp</code>  |   <code class="option">fasta</code>  |   <code class="option">fastq</code>  |   <code class="option">fcov</code>  |   <code class="option">gbf</code>  |   <code class="option">gff3</code>  |   <code class="option">hsnp</code>  |   <code class="option">html</code>  |   <code class="option">maf</code>  |   <code class="option">phd</code>  |   <code class="option">sam</code>  |   <code class="option">samnbb</code>  |   <code class="option">text</code>  |   <code class="option">tcs</code>  |   <code class="option">wig</code> }
	      </code></em>
	      </code>
	      <code class="option">[ -t &#8230; ]</code>
	    </span></dt><dd><p>
		<span class="quote">&#8220;<span class="quote">To-type</span>&#8221;</span>, the format of the output file. Multiple
		mentions of  [-t] are allowed, in which case
		<span class="command"><strong>miraconvert</strong></span> will convert to multiple types.
	      </p></dd><dt><span class="term"><code class="option">-a</code></span></dt><dd><p>
		Append. Results of conversion are appended to existing files instead of overwriting them.
	      </p></dd><dt><span class="term"><code class="option">-A</code></span></dt><dd><p>
		Do not adjust sequence case.
	      </p><p>
		When reading formats which define clipping points (like CAF,
		MAF or EXP), and saving to formats which do not have clipping
		information, miraconvert normally adjusts the case of read
		sequences: lower case for clipped parts, upper case for
		unclipped parts of reads.  Use -A if you do not want this. See
		also -C.
	      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Applies only to files/formats which do not contain contigs.
	      </td></tr></table></div></dd><dt><span class="term"><code class="option">-b</code></span></dt><dd><p>
		Blind data. Replace all bases in all reads / contigs with a 'c'.
	      </p></dd><dt><span class="term"><code class="option">-C</code></span></dt><dd><p>
		Hard clip reads. When the input is a format which contains clipping
		points in sequences and the requested output consists of sequences
		of reads, only the unclipped parts of sequences will be saved as
		results.
	      </p></dd><dt><span class="term"><code class="option">-d</code></span></dt><dd><p>
		Delete gap only columns. When output is contigs: delete
		columns that are entirely gaps (can occur after having deleted
		reads during editing in gap4, consed or other). When output is
		reads: delete gaps in reads.
	      </p></dd><dt><span class="term"><code class="option">-F</code></span></dt><dd><p>
		Filter read groups to different files. Works only for input
		files containing readgroups, i.e., CAF or MAF. 3 (or 4) files
		are generated: one or two for paired, one for unpaired and one
		for debris reads. Reads in paired file are interlaced by
		default, use -F twice to create separate files.
	      </p></dd><dt><span class="term"><code class="option">-m</code></span></dt><dd><p>
		Make contigs. Encase single reads as contig singlets into a CAF/MAF
		file.
	      </p></dd><dt><span class="term"><code class="option">-n <em class="replaceable"><code>namefile</code></em></code></span></dt><dd><p>
		Name select. Only contigs or reads whose name appears in
		<code class="filename">namefile</code> are selected for
		output. <code class="filename">namefile</code> is a
		simple text file having one name entry per line.
	      </p></dd><dt><span class="term"><code class="option">-i</code></span></dt><dd><p>
		When -n is used, inverts the selection.
	      </p></dd><dt><span class="term"><code class="option">-o <em class="replaceable"><code>offset</code></em></code></span></dt><dd><p>
		Offset of quality values in FASTQ files. Only valid if -f is FASTQ.
	      </p></dd><dt><span class="term"><code class="option">-P <em class="replaceable"><code>MIRA-PARAMETERSTRING</code></em></code></span></dt><dd><p>
		Additional MIRA parameters. Allows initialising the underlying MIRA
		routines with specific parameters. A use case can be, e.g., to
		recalculate a consensus of an assembly in a slightly different way
		(see also [-r]) than the one which is stored in assembly
		files. Example: to tell the consensus algorithm to use a minimum
		number of reads per group for 454 reads, use: "454_SETTINGS -CO:mrpg=4".
	      </p><p>
		Consult the MIRA reference manual for a full list of MIRA
		parameters.
	      </p></dd><dt><span class="term"><code class="option">-q quality_value</code></span></dt><dd><p>
		When loading read data from files where sequence and quality
		are split in several files (e.g. FASTA with .fasta and
		.fasta.qual files), do not stop if the quality values for a
		read are missing but set them to be the quality_value given.
	      </p></dd><dt><span class="term"><code class="option">-R <em class="replaceable"><code>namestring</code></em></code></span></dt><dd><p>
		Rename contigs/singlets/reads with given name string to which
		a counter is added.
	      </p><p>
		Known bug: will create duplicate names if input (CAF or
		MAF) contains contigs/singlets as well as free reads, i.e.
		reads not in contigs nor singlets.
	      </p></dd><dt><span class="term"><code class="option">-S <em class="replaceable"><code>namescheme</code></em></code></span></dt><dd><p>
		Naming scheme for renaming reads, important for
		paired-ends. Only 'solexa' is supported at the moment.
	      </p></dd><dt><span class="term"><code class="option">-Y <em class="replaceable"><code>integer</code></em></code></span></dt><dd><p>
		Yield. Defines the maximum number of (clipped/padded) bases to
		convert. When used on reads: output will contain first reads
		of file where length of clipped bases totals at least -Y.
		When used on contigs: output will contain first contigs of
		file where length of padded contigs totals at least -Y.
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_mutils_cp_options_contigs"></a>10.1.3.2. Options for input containing contig data</h4></div></div></div><p>
	  The following switches will work only if the input file contains
	  contigs (i.e., CAF or MAF with contig data). Though infrequent, note
	  that both CAF and MAF can contain single reads only.
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term"><code class="option">-M</code></span></dt><dd><p>
		Do not extract contigs (or their consensus), but the reads
		they are composed of.
	      </p></dd><dt><span class="term"><code class="option">-N <em class="replaceable"><code>namefile</code></em></code></span></dt><dd><p>
		Name select, sorted. Only contigs/reads whose name appears in
		<code class="filename">namefile</code> are selected for
		output. Regardless of the order of
		contigs/reads in the input, the output is sorted according to
		the appearance of names in
		<code class="filename">namefile</code>. <code class="filename">namefile</code>
		is a simple text file having one name entry per line.
	      </p><p>
		Note that for this function to work, all contigs/reads are
		loaded into memory which may be straining your RAM for larger
		projects.
	      </p></dd><dt><span class="term">
	      <code class="option">-r
	      <em class="replaceable"><code>
		{ <code class="option">c</code>  |   <code class="option">C</code>  |   <code class="option">q</code>  |   <code class="option">f</code> }
	      </code></em>
	      </code>
	    </span></dt><dd><p>
		Recalculate consensus and / or consensus quality values and / or
		SNP feature tags of an assembly. This feature is useful in case
		third party programs create their own consensus sequences without
		handling different sequencing technologies (e.g. the combination
		of <span class="command"><strong>gap4</strong></span> and <span class="command"><strong>caf2gap</strong></span>) or
		when the CAF/MAF files do not contain consensus sequences at
		all.
	      </p><div class="variablelist"><dl class="variablelist"><dt><span class="term"><code class="option">c</code></span></dt><dd>
		    recalculate consensus &amp; consensus qualities using IUPAC where necessary
		  </dd><dt><span class="term"><code class="option">C</code></span></dt><dd>
		    recalculate consensus &amp; consensus qualities forcing ACGT calls and without IUPAC codes
		  </dd><dt><span class="term"><code class="option">q</code></span></dt><dd>
		    recalculate consensus quality values only
		  </dd><dt><span class="term"><code class="option">f</code></span></dt><dd>
		    recalculate SNP features
		  </dd></dl></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		Only the last of cCq is relevant, 'f' works as a switch and can be
		combined with the others (e.g. <span class="quote">&#8220;<span class="quote">-r Cf</span>&#8221;</span>).
	      </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
		If the CAF/MAF contains reads from multiple strains, recalculation
		of consensus &amp; consensus qualities is forced, you can just
		influence whether IUPACs are used or not. This is due to the fact
		that CAF/MAF do not provide facilities to store consensus
		sequences from multiple strains.
	      </td></tr></table></div></dd><dt><span class="term"><code class="option">-s</code></span></dt><dd><p>
		Split. Split output into single files, one file per
		contig. Files are named according to name of contig.
	      </p></dd><dt><span class="term"><code class="option">-u</code></span></dt><dd><p>
		fillUp strain genomes. In assemblies made of multiple strains,
		holes in the consensus of a strain (bases 'N' or '@') can be
		filled up with the consensus of the other strains. Takes effect
		only when '-r' is active.
	      </p></dd><dt><span class="term"><code class="option">-Q <em class="replaceable"><code>quality_value</code></em></code></span></dt><dd><p>
		Defines minimum quality a consensus base of a strain
		must have, consensus bases below this will be set to 'N'.
		Only used when -r is active.
	      </p></dd><dt><span class="term"><code class="option">-V <em class="replaceable"><code>coverage_value</code></em></code></span></dt><dd><p>
		Defines minimum coverage a consensus base of a strain must
		have, consensus bases below this coverage will be set to 'N'.
		Only used when -r is active.
	      </p></dd><dt><span class="term"><code class="option">-v</code></span></dt><dd><p>
		Print version number and exit.
	      </p></dd><dt><span class="term"><code class="option">-x <em class="replaceable"><code>length</code></em></code></span></dt><dd><p>
		Minimum length a contig (in full assemblies) or read (in single
		sequence files) must have. All contigs / reads with a
		length less than this value are discarded. Default: 0 (=switched
		off).
	      </p><p>
		Note: this is of course not applied to reads in contigs! Contigs passing
		the [-x] length criterion and stored as complete
		assembly (CAF, MAF, ACE, etc.) still contain all their reads.
	      </p></dd><dt><span class="term"><code class="option">-X <em class="replaceable"><code>length</code></em></code></span></dt><dd><p>
		Similar to [-x], but applies only to clipped reads
		(input file format must have clipping points set to be
		effective).
	      </p></dd><dt><span class="term"><code class="option">-y <em class="replaceable"><code>contig_coverage</code></em></code></span></dt><dd><p>
		Minimum average contig coverage. Contigs with an average
		coverage less than this value are discarded.
	      </p></dd><dt><span class="term"><code class="option">-z <em class="replaceable"><code>min_reads</code></em></code></span></dt><dd><p>
		Minimum number of reads in contig. Contigs with fewer
		reads than this value are discarded.
	      </p></dd><dt><span class="term"><code class="option">-l <em class="replaceable"><code>line_length</code></em></code></span></dt><dd><p>
		On output of assemblies as text or HTML: number of bases shown in
		one alignment line. Default: 60.
	      </p></dd><dt><span class="term"><code class="option">-c <em class="replaceable"><code>endgap_character</code></em></code></span></dt><dd><p>
		On output of assemblies as text or HTML: character used to pad
		endgaps. Default: ' ' (a blank)
	      </p></dd></dl></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_mutils_cp_examples"></a>10.1.4. Examples</h3></div></div></div><p>
	In the following examples, the CAF and MAF files used are expected to
	contain full assembly data like the files created by MIRA during an
	assembly or by the gap2caf program. CAF and MAF could be used
	interchangeably in these examples, depending on which format currently
	is available. In general though, MAF is faster to process and smaller
	on disk.
      </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	    Simple conversion: a MIRA MAF file to a SAM file
	  </span></dt><dd><pre class="screen">
<strong class="userinput"><code>miraconvert source.maf destination.sam</code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
		Previous versions of miraconvert had a slightly different
		syntax, which however is still supported:
	      </p><pre class="screen">
<strong class="userinput"><code>miraconvert -f maf -t sam source.maf destination</code></strong></pre></td></tr></table></div></dd><dt><span class="term">
	    Simple conversion: the consensus of an assembly to FASTA, at the
	    same time coverage data for contigs to WIG and furthermore
	    translate the CAF to ACE:
	  </span></dt><dd><pre class="screen">
<strong class="userinput"><code>miraconvert source.caf destination.fasta wig ace</code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
		Previous versions of miraconvert had a slightly different
		syntax, which however is still supported:
	      </p><pre class="screen">
<strong class="userinput"><code>miraconvert -f caf -t fasta -t wig -t ace source.caf destination</code></strong></pre></td></tr></table></div></dd><dt><span class="term">
	    Filtering an assembly for contigs of length &#8805;2000 and an
	    average coverage &#8805; 10, while translating from MAF to CAF:
	  </span></dt><dd><pre class="screen">
<strong class="userinput"><code>miraconvert -x 2000 -y 10 source.maf destination.caf</code></strong></pre></dd><dt><span class="term">
	    Filtering a FASTQ file for reads &#8805; 55 base pairs, rename the
	    selected reads with a string starting <span class="quote">&#8220;<span class="quote">newname</span>&#8221;</span> and
	    save them back to FASTQ. Note how  [-t fastq] was left out
	    as the default behaviour of <span class="command"><strong>miraconvert</strong></span> is
	    to use the same "to" type as the input type ( [-f]).
	  </span></dt><dd><pre class="screen">
<strong class="userinput"><code>miraconvert -x 55 -R newname source.fastq destination.fastq</code></strong></pre></dd><dt><span class="term">
	    Filtering and reordering contigs of an assembly according to external contig name list.
	  </span></dt><dd><p>
	      This example will fetch the contigs named bchoc_c14, ...3, ...5
	      and ...13 and save the result in exactly that order to a new
	      file:
	    </p><pre class="screen">
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>ls -l</code></strong>
-rw-r--r-- 1 bach users  231698898 2007-10-21 15:16 bchoc_out.caf
-rw-r--r-- 1 bach users         38 2007-10-21 15:16 contigs.lst
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>cat contigs.lst</code></strong>
bchoc_c14
bchoc_c3
bchoc_c5
bchoc_c13
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>miraconvert -N contigs.lst bchoc_out.caf myfilteredresult.caf</code></strong>
[...]
<code class="prompt">arcadia:/path/to/myProject$</code> <strong class="userinput"><code>ls -l</code></strong>
-rw-r--r-- 1 bach users  231698898 2007-10-21 15:16 bchoc_out.caf
-rw-r--r-- 1 bach users         38 2007-10-21 15:16 contigs.lst
-rw-r--r-- 1 bach users     828726 2007-10-21 15:24 myfilteredresult.caf</pre></dd></dl></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_mutils_bait"></a>10.2. mirabait - a "grep" for kmers</h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_mutils_bait_synopsis"></a>10.2.1. 
	Synopsis
      </h3></div></div></div><div class="cmdsynopsis"><p><code class="command">mirabait</code>  [options] {-b <em class="replaceable"><code>baitfile</code></em> [-b ...] | -L <em class="replaceable"><code>file</code></em>} [-p <em class="replaceable"><code>file1 file2</code></em> | -P <em class="replaceable"><code>file3</code></em>]*
	 [<em class="replaceable"><code>file4 ...</code></em>]</p></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	The above command line format appeared only in MIRA 4.9.0!
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_mutils_bait_description"></a>10.2.2. Description</h3></div></div></div><p>
	<span class="command"><strong>mirabait</strong></span> selects reads from a read collection which
	are partly similar or equal to sequences defined as target
	baits. Similarity is defined by finding a user-adjustable number of
	common k-mers (sequences of k consecutive bases) which are the same in
	the bait sequences and the screened sequences to be selected, either in forward
	or reverse complement direction.
      </p><p>
	When used on paired files (-p or -P), selects read pairs where at least
	one read matches.
      </p><p>
	One can use <span class="command"><strong>mirabait</strong></span> to do targeted assembly by
	fishing out reads belonging to a gene and just assemble these; or to
	clean out rRNA sequences from data sets; or to fish out and
	iteratively reconstruct mitochondria from metagenomic data; or, or, or
	... whenever one has to take in or take out subsets of reads, this
	tool should come in quite handy.
      </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	The search performed is exact, that is, sequences selected are
	guaranteed to have the required number of matching k-mers to the bait
	sequences while sequences not selected are guaranteed not to have these.
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_mutils_bait_options"></a>10.2.3. Options</h3></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_mutils_bait_mainoptions"></a>10.2.3.1. Main options</h4></div></div></div><div class="variablelist"><dl class="variablelist"><dt><span class="term"><code class="option">-b <em class="replaceable"><code>file</code></em></code></span></dt><dd><p>
		A file containing sequences to be used as bait. The file can
		be in any of the following types: FASTQ, FASTA, GenBank (.gbf,
		.gbk, .gbff), CAF, MAF or Staden EXP. Should the file
		extension not be any of the previously recognised types,
		mirabait will try to load a bait file as FASTA file, use -F to
		override this.
	      </p><p>
		Using multiple -b for loading bait sequences from multiple
		files is allowed.
	      </p></dd><dt><span class="term"><code class="option">-p <em class="replaceable"><code>file_1 file_2</code></em></code></span></dt><dd><p>
		Instructs to load sequences to be baited from files
		<code class="filename">file_1</code> and
		<code class="filename">file_2</code>. The sequences are treated as
		pairs, where a read in one file is paired with a read in the
		second file. The files can be in any of the following types:
		FASTQ, FASTA, GenBank (.gbf, .gbk, .gbff), CAF, MAF or Staden
		EXP. Should the file extension not be any of the previously
		recognised types, mirabait will try to load files as FASTQ
		files, use -f to override this.
	      </p><p>
		Using multiple -p for baiting sequences from multiple files is
		allowed.
	      </p></dd><dt><span class="term"><code class="option">-P <em class="replaceable"><code>file</code></em></code></span></dt><dd><p>
		Instructs to load sequences to be baited from file
		<code class="filename">file</code>. The sequences are treated as pairs,
		where a read in the file is immediately followed by its paired
		read. The file can be in any of the following types: FASTQ,
		FASTA, GenBank (.gbf, .gbk, .gbff), CAF, MAF or Staden
		EXP. Should the file extension not be any of the previously
		recognised types, mirabait will try to load it as FASTQ file,
		use -f to override this.
	      </p><p>
		Using multiple -P for baiting sequences from multiple files is
		allowed.
	      </p></dd><dt><span class="term"><code class="option">-L</code></span></dt><dd><p>
		Do not load bait sequences from a file with sequences, but
		instead treat the baitfilename as file name of a valid
		<span class="command"><strong>mirabait</strong></span> hash statistics file and load it
		from disk.
	      </p><p>
		This feature enables one to reuse baits from earlier runs
		without having to wait for the recomputation of hash
		statistics.
	      </p></dd><dt><span class="term"><code class="option">-k <em class="replaceable"><code>kmer-length</code></em></code></span></dt><dd><p>
		k-mer, length of bait in bases (&#8804;256, default=31)
	      </p></dd><dt><span class="term"><code class="option">-n <em class="replaceable"><code>integer</code></em></code></span></dt><dd><p>
		Default value: 1.
	      </p><p>
		If the integer given is &gt; 0: minimum number of kmers needed
		for a sequence to be selected.
	      </p><p>
		If the integer given is &#8804; 0: maximum number of missed kmers
		allowed over sequence length for a sequence to be selected.
	      </p></dd><dt><span class="term"><code class="option">-i</code></span></dt><dd><p>
		Inverse selection: selects only sequences that do not meet the
		-k and -n criteria.
	      </p></dd><dt><span class="term"><code class="option">-I</code></span></dt><dd><p>
		Filters and writes sequences which hit to one file and
		sequences which do not hit to a second file.
	      </p></dd><dt><span class="term"><code class="option">-r</code></span></dt><dd><p>
		Does not check for hits in reverse complement direction.
	      </p></dd><dt><span class="term"><code class="option">-c</code></span></dt><dd><p>
		Normally, mirabait will change the case of the sequences it
		loads to denote kmers which hit a bait in upper case and kmers
		which did not hit a bait in lower case. Using -c switches off
		this behaviour.
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_mutils_bait_filetypeoptions"></a>10.2.3.2. File type options</h4></div></div></div><p>
	  Normally, mirabait recognises the file types according to the file
	  extension. In case you need to force a certain file type because
	  the file extension is non-standard, use the following options:
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	      <code class="option">-f
	      <em class="replaceable"><code>
		{ <code class="option">caf</code>  |   <code class="option">maf</code>  |   <code class="option">fasta</code>  |   <code class="option">fastq</code>  |   <code class="option">gbf</code>  |   <code class="option">phd</code> }
	      </code></em>
	      </code>
	    </span></dt><dd><p>
		<span class="quote">&#8220;<span class="quote">From-type</span>&#8221;</span>, the format of the input
		file. Default: fastq.
	      </p><p>
		Normally, mirabait will determine the format of an input file
		by looking at the postfix of the file name. The -f option can
		be used to override this mechanism or to load data from files
		where no filename postfix is available.
	      </p></dd><dt><span class="term">
	      <code class="option">-F
	      <em class="replaceable"><code>
		{ <code class="option">caf</code>  |   <code class="option">maf</code>  |   <code class="option">fasta</code>  |   <code class="option">fastq</code>  |   <code class="option">gbf</code>  |   <code class="option">phd</code> }
	      </code></em>
	      </code>
	    </span></dt><dd><p>
		Like -f, but for the bait sequences.
	      </p></dd><dt><span class="term">
	      <code class="option">-t
	      <em class="replaceable"><code>
		{ <code class="option">caf</code>  |   <code class="option">maf</code>  |   <code class="option">fasta</code>  |   <code class="option">fastq</code>  |   <code class="option">txt</code> }
	      </code></em>
	      </code>
	    </span></dt><dd><p>
		<span class="quote">&#8220;<span class="quote">To-type</span>&#8221;</span>, the format of the output
		file. Default: format of the input.
	      </p><p>
		Normally, mirabait will write an output file in the format defined
		by the input file(s). The -t option can be used to change the
		output to a desired format.
	      </p></dd></dl></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_mutils_bait_outputdef"></a>10.2.3.3. Output definition</h4></div></div></div><p>
	  Normally, mirabait writes separate result files (named
	  <code class="filename">bait_match_*</code> and
	  <code class="filename">bait_miss_*</code>) for each input to the current
	  directory. For changing this behaviour, use these options:
	</p><div class="variablelist"><dl class="variablelist"><dt><span class="term"><code class="option">-N <em class="replaceable"><code>name</code></em></code></span></dt><dd><p>
		Change the file prefix <code class="filename">bait</code> to
		<code class="filename">name</code>. Has no effect if -o/-O is used and
		targets are not directories.
	      </p></dd><dt><span class="term"><code class="option">-o <em class="replaceable"><code>path</code></em></code></span></dt><dd><p>
		Save sequences matching a bait to
		<code class="filename">path</code>. If <code class="filename">path</code> is a
		directory, write separate files into this directory. If not,
		combine all matching sequences from the input file(s) into a
		single file specified by the path.
	      </p></dd><dt><span class="term"><code class="option">-O <em class="replaceable"><code>path</code></em></code></span></dt><dd><p>
		Like -o, but for sequences not matching.
	      </p></dd></dl></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_mutils_bait_examples"></a>10.2.4. Usage examples</h3></div></div></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	The examples below, together with the manual above, should be enough to get
	you going. If there's a typical use case you are missing, feel free to
	suggest it on the MIRA talk mailing list.
      </td></tr></table></div><p>Baiting unpaired sequences, bait sequences in FASTA, sequences in FASTQ:</p><pre class="screen"><strong class="userinput"><code>mirabait -b b.fasta file.fastq</code></strong></pre><p>Same as above, but baits in two files (FASTA and GenBank):</p><pre class="screen"><strong class="userinput"><code>mirabait -b b1.fasta -b b2.gbk file.fastq</code></strong></pre><p>Baiting paired sequences, read pairs are in two files:</p><pre class="screen"><strong class="userinput"><code>mirabait -b b.fasta -p file_1.fastq file_2.fastq</code></strong></pre><p>Baiting paired sequences, pairs are interleaved in one file:</p><pre class="screen"><strong class="userinput"><code>mirabait -b b.fasta -P file.fastq</code></strong></pre><p>Like above, but selecting sequences which do not match the baits:</p><pre class="screen"><strong class="userinput"><code>mirabait -i -b b.fasta -P file.fastq</code></strong></pre><p>Baiting paired sequences (<code class="filename">file_1.fastq</code>, <code class="filename">file_2.fastq</code> and <code class="filename">file3.fasta</code>) and unpaired sequences (<code class="filename">file4.caf</code>), all at once and with different file types:</p><pre class="screen"><strong class="userinput"><code>mirabait -b b.fasta -p file_1.fastq file_2.fastq -P file3.fasta file4.caf</code></strong></pre><p>Like above, but writing sequences matching baits and sequences not matching baits to different files:</p><pre class="screen"><strong class="userinput"><code>mirabait -I -b b.fasta -p file_1.fastq file_2.fastq -P file3.fasta file4.caf</code></strong></pre><p>Change bait criterion to need 10 kmers of size 27:</p><pre class="screen"><strong class="userinput"><code>mirabait -k 27 -n 10 -b b.fasta file.fastq</code></strong></pre><p>
	Change bait criterion to baiting only reads which have all kmers
	present in the bait:
      </p><pre class="screen"><strong class="userinput"><code>mirabait -n 0 -b b.fasta file.fastq</code></strong></pre><p>
	Change bait criterion to baiting all reads having almost all kmers present in the
	bait, but allowing for up to 40 kmers not in the bait:
      </p><pre class="screen"><strong class="userinput"><code>mirabait -n -40 -b b.fasta file.fastq</code></strong></pre><p>Force bait sequences to load as FASTA, force sequences to be baited to be loaded as FASTQ:</p><pre class="screen"><strong class="userinput"><code>mirabait -F fasta -f fastq -b b.dat file.dat</code></strong></pre><p>Write result files to directory <code class="filename">/dev/shm/</code>:</p><pre class="screen"><strong class="userinput"><code>mirabait -o /dev/shm/ -b b.fasta -p file_1.fastq file_2.fastq</code></strong></pre><p>Merge all result files containing sequences hitting baits to file <code class="filename">/dev/shm/match</code>:</p><pre class="screen"><strong class="userinput"><code>mirabait -o /dev/shm/match -b b.fasta -p file_1.fastq file_2.fastq</code></strong></pre><p>Like above, but also merge all result files containing sequences not hitting baits to file <code class="filename">/dev/shm/nomatch</code>:</p><pre class="screen"><strong class="userinput"><code>mirabait -o /dev/shm/match -O /dev/shm/nomatch -b b.fasta -p file_1.fastq file_2.fastq</code></strong></pre></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_hard"></a>Chapter 11. Assembly of <span class="emphasis"><em>hard</em></span> genome or EST / RNASeq projects</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_hard_getting_mean_data_assembled">11.1. 
      Getting 'mean' genomes or EST / RNASeq data sets assembled
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_hard_for_the_impatient">11.1.1. 
	For the impatient
      </a></span></dt><dt><span class="sect2"><a href="#sect_hard_introduction_to_masking">11.1.2. 
	Introduction to 'masking'
      </a></span></dt><dt><span class="sect2"><a href="#sect_hard_how_does_nasty_repeat_masking_work">11.1.3. 
	How does 'nasty repeat' masking work?
      </a></span></dt><dt><span class="sect2"><a href="#sect_hard_selecting_a_nasty_repeat_ratio">11.1.4. 
	Selecting a "nasty repeat ratio"
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_hard_how_MIRA_tags_different_repeat_levels">11.2. 
      How MIRA tags different repeat levels
    </a></span></dt><dt><span class="sect1"><a href="#sect_hard_the_readrepeats_info_file">11.3. 
      The readrepeats info file
    </a></span></dt><dt><span class="sect1"><a href="#sect_hard_pipeline_to_find_worst_contaminants_or_repeats_in_sequencing_data">11.4. 
      Pipeline to find worst contaminants or repeats in sequencing data
    </a></span></dt><dt><span class="sect1"><a href="#sect_hard_examples_for_kmer_statistics">11.5. 
      Examples for kmer statistics
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_hard_caveat:_sk:kms">11.5.1. 
	Caveat: -SK:kmer_size
      </a></span></dt><dt><span class="sect2"><a href="#sect_hard_sanger_sequencing_a_simple_bacterium">11.5.2. 
	Sanger sequencing, a simple bacterium
      </a></span></dt><dt><span class="sect2"><a href="#sect_hard_454_sequencing_a_somewhat_more_complex_bacterium">11.5.3. 
	454 Sequencing, a somewhat more complex bacterium
      </a></span></dt><dt><span class="sect2"><a href="#sect_hard_solexa_sequencing_ecoli_mg1655">11.5.4. 
	Solexa sequencing, E.coli MG1655
      </a></span></dt><dt><span class="sect2"><a href="#sect_hard_need_examples_for_eukaryotes">11.5.5. 
	(NEED EXAMPLES FOR EUKARYOTES)
      </a></span></dt><dt><span class="sect2"><a href="#sect_hard_need_examples_for_pathological_cases">11.5.6. 
	(NEED EXAMPLES FOR PATHOLOGICAL CASES)
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">If it were easy, it would have been done already.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_hard_getting_mean_data_assembled"></a>11.1. 
      Getting 'mean' genomes or EST / RNASeq data sets assembled
    </h2></div></div></div><p>
    </p><p>
      For some EST data sets you might want to assemble, MIRA will take too
      long or the available memory will not be sufficient. For genomes this
      can be the case for eukaryotes, plants, but also for some bacteria which
      contain a high number of (pro-)phages, plasmids or engineered operons. For
      EST data sets, this concerns all projects with non-normalised libraries.
    </p><p>
      This guide is intended to get you through these problematic genomes. It
      is not (and cannot be) exhaustive, but it should get you going.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_for_the_impatient"></a>11.1.1. 
	For the impatient
      </h3></div></div></div><p>
	For bacteria with nasty repeats, try first
	[--hirep_something]. This will increase runtime and memory
	requirements, but helps to get this sorted out. If the data for lower
	eukaryotes leads to runtime and memory explosion, try either
	 [--hirep_good] or, for desperate cases,
	 [--hirep_something].
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_introduction_to_masking"></a>11.1.2. 
	Introduction to 'masking'
      </h3></div></div></div><p>
	The SKIM phase (all-against-all comparison) will report almost every potential
	hit to be checked with Smith-Waterman further downstream in the MIRA assembly
	process. While this is absolutely no problem for most bacteria, some genomes
	(eukaryotes, plants, some bacteria) have so many closely related sequences
	(repeats) that the data structures needed to take up all information might get
	much larger than your available memory. In those cases, your only chance to
	still get an assembly is to tell the assembler it should disregard extremely
	repetitive features of your genome.
      </p><p>
	There is, in most cases, one problem: one doesn't know beforehand which parts
	of the genome are extremely repetitive. But MIRA can help you here as it
	produces most of the needed information during assembly and you just need to
	choose a threshold from where on MIRA won't care about repetitive matches.
      </p><p>
	The keys to this are the three fail-safe command line parameters which will mask
	"nasty" repeats from the quick overlap finder (SKIM): [-KS:mnr], combined
	with either [-KS:nrr] or [-KS:nrc]. I'll come back
	to  [-SK:kms] later as it also plays a role in this.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_how_does_nasty_repeat_masking_work"></a>11.1.3. 
	How does 'nasty repeat' masking work?
      </h3></div></div></div><p>
      </p><p>
	If switched on [-KS:mnr=yes], MIRA will use k-mer statistics to
	find repetitive stretches. K-mers are nucleotide stretches of length k. In a
	perfectly sequenced genome without any sequencing error and without sequencing
	bias, the k-mer frequency can be used to assess how many times a given
	nucleotide stretch is present in the genome: if a specific k-mer is present as
	many times as the average frequency of all k-mers, it is a reasonable
	assumption to estimate that the specific k-mer is not part of a repeat (at
	least not in this genome).
      </p><p>
	Following the same path of thinking, if a specific k-mer frequency is now two
	times higher than the average of all k-mers, one would assume that this
	specific k-mer is part of a repeat which occurs exactly two times in the
	genome. For 3x k-mer frequency, a repeat is present three times, and so on. MIRA
	will merge information on single k-mer frequencies into larger 'repeat'
	stretches and tag these stretches accordingly.
      </p><p>
	Of course, low-complexity nucleotide stretches (like poly-A in eukaryotes),
	sequencing errors in reads and non-uniform distribution of reads in a
	sequencing project will weaken the initial assumption that a k-mer frequency
	is representative for repeat status. But even then the k-mer frequency model
	works quite well and will give a pretty good overall picture: most repeats
	will be tagged as such.
      </p><p>
	Note that the parts of reads tagged as "nasty repeat" will not get masked per
	se, the sequence will still be present. The stretches dubbed repetitive will
	get the "MNRr" tag. They will still be used in Smith-Waterman overlaps and
	will generate a correct consensus if included in an alignment, but they will
	not be used as seed.
      </p><p>
	Some reads will invariably end up being completely repetitive. These
	will not be assembled into contigs as MIRA will not see overlaps as
	they'll be completely masked away. These reads will end up as
	debris. However, note that MIRA is pretty good at discerning 100%
	matching repeats from repeats which are not 100% matching: if there's
	a single base with which repeats can be discerned from each other,
	MIRA will find this base and use the k-mers covering that base to find
	overlaps.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_selecting_a_nasty_repeat_ratio"></a>11.1.4. 
	Selecting a "nasty repeat ratio"
      </h3></div></div></div><p>
      </p><p>
	The ratio from which on the MIRA kmer statistics algorithm won't
	report matches is set via [-KS:nrr]. E.g.,
	using  [-KS:nrr=10] will hide all k-mers which occur at a
	frequency 10 times (or more) higher than the median of all k-mers.
      </p><p>
	The nastiness of a repeat is difficult to judge, but starting with 10 copies
	in a genome, things can get complicated. At 20 copies, you'll have some
	troubles for sure.
      </p><p>
	The standard value of <span class="emphasis"><em>10</em></span> for
	the  [-KS:nrr] parameter is a pretty good starting value
	which can be tried for an assembly before trying to optimise it via
	studying the kmer statistics calculated by MIRA. For the latter, please
	read the section 'Examples for kmer statistics' further down in this
	guide.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_hard_how_MIRA_tags_different_repeat_levels"></a>11.2. 
      How MIRA tags different repeat levels
    </h2></div></div></div><p>
      During SKIM phase, MIRA will assign frequency information to each and every
      k-mer in all reads of a sequencing project, giving them different
      status. Additionally, tags are set in the reads so that one can
      assess reads in assembly editors that understand tags (like gap4,
      gap5, consed etc.). The following tags are used:
    </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	  HAF2
	</span></dt><dd><p> coverage below average ( default: &lt; 0.5 times average)
	  </p></dd><dt><span class="term">
	  HAF3
	</span></dt><dd><p> coverage is at average ( default: &#8805; 0.5 times average and &#8804; 1.5 times average)
	  </p></dd><dt><span class="term">
	  HAF4
	</span></dt><dd><p> coverage above average ( default: &gt; 1.5 times average and &lt; 2 times average)
	  </p></dd><dt><span class="term">
	  HAF5
	</span></dt><dd><p> probably repeat ( default: &#8805; 2 times average and &lt; 5 times average)
	  </p></dd><dt><span class="term">
	  HAF6
	</span></dt><dd><p> 'crazy' repeat ( default: &gt; 5 times average)
	  </p></dd><dt><span class="term">
	  MNRr
	</span></dt><dd><p> stretches which were masked away by [-KS:<em class="replaceable"><code>mnr=yes</code></em>]
	  being more repetitive than deduced
	  by  [-KS:<em class="replaceable"><code>nrr=...</code></em>] or given via  [-KS:<em class="replaceable"><code>nrc=...</code></em>].
	  </p></dd></dl></div><p>
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_hard_the_readrepeats_info_file"></a>11.3. 
      The readrepeats info file
    </h2></div></div></div><p>
      If [-KS:mnr=yes] is used, MIRA will write an additional file into the
      info directory:
      <code class="filename">&lt;projectname&gt;_info_readrepeats.lst</code>
    </p><p>
      The "readrepeats" file makes it possible to try and find out what makes
      sequencing data nasty. It's a key-value-value file with the name of the
      sequence as "key" and then the type of repeat (HAF2 - HAF7 and MNRr) and
      the repeat sequence as "values". "Nasty" in this case means
      <span class="emphasis"><em>everything which was masked via
      [-KS:mnr=yes]</em></span>.
    </p><p>
      The file looks like this:
    </p><pre class="screen">
read1     HAF5   GCTTCGGCTTCGGCTTCGGCTTCGGCTTCGGCTTCGGCTTCGGCTTCGGCT ...
read2     HAF7   CCGAAGCCGAAGCCGAAGCCGAAGCCGAAGCCGAAGCCGAAGCCGAAGC ...
read2     MNRr   AAAAAAAAAAAAAAAAAAAAAAAAAAAA ...
read3     HAF6   GCTTCGGCTTCGGCTTCGGCTTCGGCTTCGGCTTCGGCTTCGGCTTCGGCT ...
...
etc.
    </pre><p>
      That is, each line consists of the read name where a stretch of
      repetitive sequences was found, then the MIRA repeat categorisation
      level (HAF2 to HAF7 and MNRr) and then the stretch of bases which is
      seen to be repetitive.
    </p><p>
      Note that reads can have several disjunct repeat stretches in a single
      read, hence they can occur more than one time in the file as shown with
      <span class="emphasis"><em>read2</em></span> in the example above.
    </p><p>
      One will need to search some databases with the "nasty" sequences and find
      vector sequences, adaptor sequences or even human sequences in bacterial or
      plant genomes ... or vice versa as this type of contamination happens quite
      easily with data from new sequencing technologies. After a while one gets a
      feeling what constitutes the largest part of the problem and one can start to
      think of taking countermeasures like filtering, clipping, masking etc.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_hard_pipeline_to_find_worst_contaminants_or_repeats_in_sequencing_data"></a>11.4. 
      Pipeline to find worst contaminants or repeats in sequencing data
    </h2></div></div></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	In case you are not familiar with UNIX pipes, now would be a good time
	to read an introductory text on how this wonderful system works. You
	might want to start with a short introductory article at Wikipedia:
	<a class="ulink" href="http://en.wikipedia.org/wiki/Pipeline_%28Unix%29" target="_top">http://en.wikipedia.org/wiki/Pipeline_%28Unix%29</a>
      </p><p>
	In a nutshell: instead of output to files, a pipe directs the output
	of one program as input to another program.
      </p></td></tr></table></div><p>
      There's one very simple trick I use to find out whether your data
      contains some kind of sequencing vector or adaptor contamination. It
      makes use of the read repeat file discussed above.
    </p><p>
      The following example demonstrates this on a 454 data set where the
      sequencing provider used some special adaptor in the wet lab but somehow
      forgot to tell the Roche pre-processing software about it, so that a
      very large fraction of reads in the SFF file had unclipped adaptor
      sequence in it (which of course wreaks havoc with assembly programs):
    </p><pre class="screen"><code class="prompt">arcadia:$</code> <strong class="userinput"><code>grep MNRr <em class="replaceable"><code>badproject</code></em>_info_readrepeats.lst | cut -f 3| sort | uniq -c |sort -g -r | head -15</code></strong>
    504 ACCACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    501 CAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    489 GGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    483 GCCACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    475 AATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    442 GATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    429 CGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    424 TTGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    393 ACTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    379 CAGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    363 ATTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    343 CATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    334 GTTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    328 AACACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    324 GGTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC</pre><p>
      You probably see a sequence pattern
      CTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC in the above screenshot. Before
      going into details of what you are actually seeing, here's the
      explanation how this pipeline works:
    </p><div class="variablelist"><dl class="variablelist"><dt><span class="term">
	  grep MNRr <em class="replaceable"><code>badproject</code></em>_info_readrepeats.lst
	</span></dt><dd><p>
	    From the file with the information on repeats, grab all the lines
	    containing repetitive sequence which MIRA categorised as 'nasty'
	    via the 'MNRr' tag. The result looks a bit like this (first 15
	    lines shown):</p><pre class="screen">C6E3C7T12GKN35  MNRr    GCGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12JLIBM  MNRr    TTCACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12HQOM1  MNRr    CAGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12G52II  MNRr    CAGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12JRMPO  MNRr    TCTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12H1A8V  MNRr    GCGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12H34Z7  MNRr    AAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12H4HGC  MNRr    GGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12FNA1N  MNRr    AATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12F074V  MNRr    CTTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12I1GYO  MNRr    CAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12I53C8  MNRr    CACACTCGTATAGTGACACGCAACAGGGG
C6E3C7T12I4V6V  MNRr    ATCACTCGTATAGTGACACGCAACAGGGG
C6E3C7T12H5R00  MNRr    TCTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
C6E3C7T12IBA5E  MNRr    AATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
...</pre><p>
	  </p></dd><dt><span class="term">
	  cut -f 3
	</span></dt><dd><p>
	    We're just interested in the sequence now, which is in the third
	    column. The above 'cut' command takes care of this. The resulting
	    output may look like this (only first 15 lines shown):
	  </p><pre class="screen">GCGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
TTCACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
CAGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
CAGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
TCTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
GCGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
AAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
GGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
AATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
CTTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
CAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
CACACTCGTATAGTGACACGCAACAGGGG
ATCACTCGTATAGTGACACGCAACAGGGG
TCTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
AATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
...</pre></dd><dt><span class="term">
	  sort
	</span></dt><dd><p>
	    Simply sort all sequences. The output may look like this now (only first 15 lines shown):</p><pre class="screen">
AAACTCGTATAGTGACACGCA
AAACTCGTATAGTGACACGCAACAGG
AAACTCGTATAGTGACACGCAACAGGG
AAACTCGTATAGTGACACGCAACAGGGG
AAACTCGTATAGTGACACGCAACAGGGG
AAACTCGTATAGTGACACGCAACAGGGG
AAACTCGTATAGTGACACGCAACAGGGG
AAACTCGTATAGTGACACGCAACAGGGG
AAACTCGTATAGTGACACGCAACAGGGGAT
AAACTCGTATAGTGACACGCAACAGGGGATA
AAACTCGTATAGTGACACGCAACAGGGGATA
AAACTCGTATAGTGACACGCAACAGGGGATA
AAACTCGTATAGTGACACGCAACAGGGGATA
AAACTCGTATAGTGACACGCAACAGGGGATA
AAACTCGTATAGTGACACGCAACAGGGGATA
...</pre><p>
	  </p></dd><dt><span class="term">
	  uniq -c
	</span></dt><dd><p>
	    This command counts how often a line repeats itself in a file. As
	    we previously sorted the whole file by sequence, it effectively
	    counts how often a certain sequence has been tagged as MNRr. The
	    output consists of a tab delimited format in two columns: the
	    first column contains the number of times a given line (sequence
	    in our case) was seen, the second column contains the line
	    (sequence) itself. An example output looks like this (only first 15 lines shown):
	  </p><pre class="screen">      1 AAACTCGTATAGTGACACGCA
      1 AAACTCGTATAGTGACACGCAACAGG
      1 AAACTCGTATAGTGACACGCAACAGGG
      5 AAACTCGTATAGTGACACGCAACAGGGG
      1 AAACTCGTATAGTGACACGCAACAGGGGAT
     13 AAACTCGTATAGTGACACGCAACAGGGGATA
      6 AAACTCGTATAGTGACACGCAACAGGGGATAGAC
      4 AAACTCGTATAGTGACACGCAACAGGGGATAGACAA
      9 AAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGC
      3 AAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCA
    257 AAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
      1 AACACTCGTATAGTGACACGCAAC
      2 AACACTCGTATAGTGACACGCAACAGGG
     23 AACACTCGTATAGTGACACGCAACAGGGG
      6 AACACTCGTATAGTGACACGCAACAGGGGATA
...</pre></dd><dt><span class="term">
	  sort -g -r
	</span></dt><dd><p>
	    We now sort the output of the previous uniq-counting command by
	    asking 'sort' to perform a numerical sort (via '-g') and
	    additionally sort in reverse order (via '-r') so that we get the
	    sequences encountered most often at the top of the output. And
	    that one looks exactly like shown previously:
	  </p><pre class="screen">
    504 ACCACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    501 CAACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    489 GGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    483 GCCACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    475 AATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    442 GATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    429 CGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    424 TTGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    393 ACTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    379 CAGACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    363 ATTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    343 CATACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    334 GTTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    328 AACACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
    324 GGTACTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC
...</pre></dd></dl></div><p>
      So, what is this ominous CTCGTATAGTGACACGCAACAGGGGATAGACAAGGCAC you are
      seeing? To make it short: a modified 454 B-adaptor with an additional MID sequence.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	These adaptor sequences have absolutely no reason to exist in your
	data, none! Go back to your sequencing provider and ask them to have a look
	at their pipeline as they should have had it set up in a way that you
	do not see these things anymore. Yes, due to sequencing errors,
	sometimes some adaptor or sequencing vector remnants will stay in
	your sequencing data, but that is no problem as MIRA is capable of
	handling that very well.
      </p><p>
	But having much more than 0.1% to 0.5% of your sequence containing
	these is a sure sign that someone goofed somewhere ... and it's very
	probably not your fault.
      </p></td></tr></table></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_hard_examples_for_kmer_statistics"></a>11.5. 
      Examples for kmer statistics
    </h2></div></div></div><p>
      Selecting the right ratio so that an assembly fits into your memory is not
      straightforward. But MIRA can help you a bit: during assembly, some frequency
      statistics are printed out (they'll probably end up in some info file in later
      releases). Search for the term "Kmer statistics" in the information printed
      out by MIRA (this happens quite early in the process).
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_caveat:_sk:kms"></a>11.5.1. 
	Caveat: -SK:kmer_size
      </h3></div></div></div><p>
	Some explanation how kmer size affects the statistics and why it
	should be chosen &gt;=16 for [-KS:mnr]
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_sanger_sequencing_a_simple_bacterium"></a>11.5.2. 
	Sanger sequencing, a simple bacterium
      </h3></div></div></div><p>
	This example is taken from a pretty standard bacterium where Sanger
	sequencing was used:
      </p><pre class="screen">
Kmer statistics:
=========================================================
Measured avg. coverage: 15

Deduced thresholds:
-------------------
Min normal cov: 7
Max normal cov: 23
Repeat cov: 29
Crazy cov: 120
Mask cov: 150

Repeat ratio histogram:
-----------------------
0       475191
1       5832419
2       181994
3       6052
4       4454
5       972
6       4
7       8
14      2
16      10
=========================================================
      </pre><p>
	The above can be interpreted like this: the expected coverage of the genome is
	15x. Starting with an estimated kmer frequency of 29, MIRA will treat a k-mer
	as 'repetitive'. As shown in the histogram, the overall picture of this
	project is pretty healthy:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    only a small fraction of k-mers have a repeat level of '0' (these would be
	    k-mers in regions with quite low coverage or k-mers containing sequencing
	    errors)
	  </p></li><li class="listitem"><p>
	    the vast majority of k-mers have a repeat level of 1 (so that's non-
	    repetitive coverage)
	  </p></li><li class="listitem"><p>
	    there is a small fraction of k-mers with repeat level of 2-10
	  </p></li><li class="listitem"><p>
	    there are almost no k-mers with a repeat level &gt;10
	  </p></li></ul></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_454_sequencing_a_somewhat_more_complex_bacterium"></a>11.5.3. 
	454 Sequencing, a somewhat more complex bacterium
      </h3></div></div></div><p>
	Here, for comparison, is a profile for a more complicated bacterium (454
	sequencing):
      </p><pre class="screen">
Kmer statistics:
=========================================================
Measured avg. coverage: 20

Deduced thresholds:
-------------------
Min normal cov: 10
Max normal cov: 30
Repeat cov: 38
Crazy cov: 160
Mask cov: 0

Repeat ratio histogram:
-----------------------
0       8292273
1       6178063
2       692642
3       55390
4       10471
5       6326
6       5568
7       3850
8       2472
9       708
10      464
11      270
12      140
13      136
14      116
15      64
16      54
17      54
18      52
19      50
20      58
21      36
22      40
23      26
24      46
25      42
26      44
27      32
28      38
29      44
30      42
31      62
32      116
33      76
34      80
35      82
36      142
37      100
38      120
39      94
40      196
41      172
42      228
43      226
44      214
45      164
46      168
47      122
48      116
49      98
50      38
51      56
52      22
53      14
54      8
55      2
56      2
57      4
87      2
89      6
90      2
92      2
93      2
1177    2
1181    2
=========================================================
      </pre><p>
	The difference to the first bacterium shown is pretty striking:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    first, the number of k-mers in repeat level 0 (below average) is higher
	    than the number of k-mers in level 1! This points to a higher number of
	    sequencing errors in the 454 reads than in the Sanger project
	    shown previously, or to a more uneven distribution of reads (but
	    not in this special case).
	  </p></li><li class="listitem"><p>
	    second, the repeat level histogram does not trail off at a repeat
	    frequency of 10 or 15, but it has a long tail up to the fifties, even having
	    a local maximum at 42. This points to a small part of the genome being
	    heavily repetitive ... or to (a) plasmid(s) in high copy numbers.
	  </p></li></ul></div><p>
      </p><p>
	Should MIRA ever have problems with this genome, switch on the nasty repeat
	masking and use a level of 15 as cutoff. In this case, 15 is OK to start with
	as a) it's a bacterium, it can't be that hard and b) the frequencies above
	level 5 are in the low thousands and not in the tens of thousands.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_solexa_sequencing_ecoli_mg1655"></a>11.5.4. 
	Solexa sequencing, E.coli MG1655
      </h3></div></div></div><p>
      </p><pre class="screen">
Kmer statistics:
=========================================================
Measured avg. coverage: 23

Deduced thresholds:
-------------------
Min normal cov: 11
Max normal cov: 35
Repeat cov: 44
Crazy cov: 184
Mask cov: 0

Repeat ratio histogram:
-----------------------
0       1365693
1       8627974
2       157220
3       11086
4       4990
5       3512
6       3922
7       4904
8       3100
9       1106
10      868
11      788
12      400
13      186
14      28
15      10
16      12
17      4
18      4
19      2
20      14
21      8
25      2
26      8
27      2
28      4
30      2
31      2
36      4
37      6
39      4
40      2
45      2
46      8
47      14
48      8
49      4
50      2
53      2
56      6
59      4
62      2
63      2
67      2
68      2
70      2
73      4
75      2
77      4
=========================================================
      </pre><p>
	This kmer statistics shows that MG1655 is pretty boring (from a
	repetitive point of view). One might expect a few repeats but nothing
	fancy: The repeats are actually the rRNA and sRNA stretches in the
	genome plus some intergenic regions.
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    the number of k-mers in repeat level 0 (below average) is
	    considerably lower than in level 1, so the Solexa sequencing
	    quality is pretty good and there shouldn't be too many
	    low coverage areas.
	  </p></li><li class="listitem"><p>
	    the histogram tail shows some faint traces of possibly highly repetitive
	    k-mers, but these are false positive matches due to some standard Solexa
	    base-calling weaknesses of earlier pipelines like, e.g., adding poly-A,
	    poly-T or sometimes poly-C and poly-G tails to reads when spots in the
	    images were faint and the base calls of bad quality
	  </p></li></ul></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_need_examples_for_eukaryotes"></a>11.5.5. 
	(NEED EXAMPLES FOR EUKARYOTES)
      </h3></div></div></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_hard_need_examples_for_pathological_cases"></a>11.5.6. 
	(NEED EXAMPLES FOR PATHOLOGICAL CASES)
      </h3></div></div></div><p>
	Vector contamination etc.
      </p></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_seqtechdesc"></a>Chapter 12. Description of sequencing technologies</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_std_intro">12.1. 
      Introduction
    </a></span></dt><dt><span class="sect1"><a href="#sect_std_sxa">12.2. 
      Illumina (formerly Solexa)
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_std_sxa_caveats_for_illumina">12.2.1. 
	Caveats for Illumina data
      </a></span></dt><dt><span class="sect2"><a href="#sect_std_sxa_highlights">12.2.2. 
	Illumina highlights
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_std_sxa_highlights_quality">12.2.2.1. 
	  Quality
	</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_std_std_sxa_lowlights">12.2.3. 
	Lowlights
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_std_sxa_lowlights_longhomopolymers">12.2.3.1. 
	  Long homopolymers
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_sxa_lowlights_GGCxG_motif">12.2.3.2. 
	  The GGCxG and GGC motifs
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_sxa_lowlights_chimericreads">12.2.3.3. 
	  Chimeric reads
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_sxa_lowlights_samplemix">12.2.3.4. 
	  Sample barcode misidentification
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_sxa_lowlights_nextera">12.2.3.5. 
	  Nextera library prep
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_sxa_lowlights_gcbias">12.2.3.6. 
	  Strong GC bias in some Solexa data (2nd half 2009 until advent of TruSeq kit at end of 2010)
	</a></span></dt></dl></dd></dl></dd><dt><span class="sect1"><a href="#sect_std_iontor">12.3. 
      Ion Torrent
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_std_iontor_hpindels">12.3.1. 
	Homopolymer insertions / deletions
      </a></span></dt><dt><span class="sect2"><a href="#sect_std_iontor_seqdirdepindels">12.3.2. 
	Sequencing direction dependent insertions / deletions
      </a></span></dt><dt><span class="sect2"><a href="#sect_std_iontor_covvariance">12.3.3. 
	Coverage variance
      </a></span></dt><dt><span class="sect2"><a href="#sect_std_iontor_gcbias">12.3.4. 
	GC bias
      </a></span></dt><dt><span class="sect2"><a href="#sect_std_iontor_other_sources_of_error">12.3.5. 
	Other sources of error
      </a></span></dt><dt><span class="sect2"><a href="#sect_std_iontor_where_to_find_further_information">12.3.6. 
	Where to find further information
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_std_pacbio">12.4. 
      Pacific BioSciences
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_std_pb_highlights">12.4.1. 
	Highlights
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_std_pb_hl_length">12.4.1.1. 
	  Sequence lengths
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_pb_hl_gcbias">12.4.1.2. 
	  GC bias
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_pb_hl_acccorrected">12.4.1.3. 
	  Accuracy of corrected reads
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_pb_hl_qualassemblies">12.4.1.4. 
	  Assemblies of corrected reads
	</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_std_pb_lowlights">12.4.2. 
	Lowlights
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_std_pb_ll_namingconfusion">12.4.2.1. 
	  Naming confusion
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_pb_ll_revseq">12.4.2.2. 
	  Forward / reverse chimeric sequences
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_pb_ll_rawreadaccuracy">12.4.2.3. 
	  Accuracy of uncorrected subreads
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_pb_ll_cpu">12.4.2.4. 
	  Immense need for CPU power
	</a></span></dt><dt><span class="sect3"><a href="#sect_std_pb_ll_dnaprep">12.4.2.5. 
	  Increased quality requirements for clean DNA sample prep
	</a></span></dt></dl></dd></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">Opinions are like chili powder - best used in moderation.</span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_std_intro"></a>12.1. 
      Introduction
    </h2></div></div></div><p>
      <span class="bold"><strong>Note:</strong></span> This section contains things I've
      seen in the past and simply jotted down. These may be fundamentally
      correct or correct only under certain circumstances or not correct at all with
      your data. You may have different observations.
    </p><p>
      ...
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_std_sxa"></a>12.2. 
      Illumina (formerly Solexa)
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_sxa_caveats_for_illumina"></a>12.2.1. 
	Caveats for Illumina data
      </h3></div></div></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	  Even if you can get bacteria sequenced with ridiculously high coverage
	  like 500x or 1000x, this amount of data is simply not needed. Even
	  more important - though counterintuitive - is the fact that due to
	  non-random sequence dependent sequencing errors, a too high coverage
	  may even make the assembly worse.
	</p><p>
	  Another rule of thumb: when having more than enough data, reduce the
	  data set so as to have an average coverage of approximately 100x. In
	  some rare cases (high GC content), perhaps 120x to 150x, but certainly
	  not more.
	</p></td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	When reducing a data set, do <span class="bold"><strong>NOT</strong></span>,
	under any circumstances, try fancy selection of reads by some
	arbitrary quality or length criteria. This will introduce a terrible
	bias in your assembly due to non-random sequence-dependent sequencing
	errors and non-random sequence dependent base quality assignment. More
	on this in the next section.
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_sxa_highlights"></a>12.2.2. 
	Illumina highlights
      </h3></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_sxa_highlights_quality"></a>12.2.2.1. 
	  Quality
	</h4></div></div></div><p>
	  For current HiSeq 100bp reads I get - after MIRA clipping - about 90
	  to 95% reads matching to a reference without a single error. MiSeq
	  250bp reads contain a couple more errors, but nothing to be alarmed
	  about.
	</p><p>
	  In short: Illumina is currently <span class="emphasis"><em>the</em></span> technology
	  to use if you want high quality reads.
	</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_std_sxa_lowlights"></a>12.2.3. 
	Lowlights
      </h3></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_sxa_lowlights_longhomopolymers"></a>12.2.3.1. 
	  Long homopolymers
	</h4></div></div></div><p>
	  Long homopolymers (stretches of identical bases in reads) can be a
	  slight problem for Solexa. However, it must be noted that this is a
	  problem of all sequencing technologies on the market so far (Sanger,
	  Solexa, 454). Furthermore, the problem is much less pronounced in
	  Solexa than in 454 data: in Solexa, the first problems may appear
	  in stretches of 9 to 10 bases, in Ion Torrent a stretch of 3 to 4
	  bases may already start being problematic in some cases.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_sxa_lowlights_GGCxG_motif"></a>12.2.3.2. 
	  The GGCxG and GGC motifs
	</h4></div></div></div><p>
	  <code class="literal">GGCxG</code> or even <code class="literal">GGC</code> motif in the
	  5' to 3' direction of reads. This one is particularly annoying and
	  it took me quite a while to circumvent in MIRA the problems it
	  causes.
	</p><p>
	  Simply put: at some places in a genome, base calling after a
	  <code class="literal">GGCxG</code> or <code class="literal">GGC</code> motif is
	  particularly error prone, the number of reads without errors
	  declines markedly. Repeated <code class="literal">GGC</code> motifs worsen
	  the situation. The following screen shots of a mapping assembly
	  illustrate this.
	</p><p>
	  The first example is the <code class="literal">GGCxG</code> motif (in form
	  of a <code class="literal">GGCTG</code>) occurring in approximately one third
	  of the reads at the shown position. Note that all but one read
	  with this problem are in the same (plus) direction.
	</p><div class="figure"><a name="sxa_unsc_ggcxg2_lenski.png"></a><p class="title"><b>Figure 12.1. 
	    The Solexa GGCxG problem.
	  </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_unsc_ggcxg2_lenski.png" width="100%" alt="The Solexa GGCxG problem."></td></tr></table></div></div></div><br class="figure-break"><p>
	  The next two screen shots show the <code class="literal">GGC</code> motif, once
	  with forward direction and once with reverse direction reads:
	</p><div class="figure"><a name="sxa_unsc_ggc1_lenski.png"></a><p class="title"><b>Figure 12.2. 
	    The Solexa GGC problem, forward example
	  </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_unsc_ggc1_lenski.png" width="100%" alt="The Solexa GGC problem, forward example"></td></tr></table></div></div></div><br class="figure-break"><div class="figure"><a name="sxa_unsc_ggc4_lenski.png"></a><p class="title"><b>Figure 12.3. 
	    The Solexa GGC problem, reverse example
	  </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_unsc_ggc4_lenski.png" width="100%" alt="The Solexa GGC problem, reverse example"></td></tr></table></div></div></div><br class="figure-break"><p>
	  Places in the genome that have <code class="literal">GGCGGC.....GCCGCC</code>
	  (a motif, perhaps even repeated, then some bases and then an
	  inverted motif) almost always have a very, very low number of good
	  reads. Especially when the motif is <code class="literal">GGCxG</code>.
	</p><p>
	  Things get especially difficult when these motifs occur at sites
	  where users may have a genuine interest. The following example is a
	  screen shot from the Lenski data (see walk-through below) where a
	  simple mapping reveals an anomaly which -- in reality -- is an IS
	  insertion (see <a class="ulink" href="http://www.nature.com/nature/journal/v461/n7268/fig_tab/nature08480_F1.html" target="_top">http://www.nature.com/nature/journal/v461/n7268/fig_tab/nature08480_F1.html</a>)
	  but could also look like a <code class="literal">GGCxG</code> motif in forward
	  direction (<code class="literal">GGCCG</code>) and at the same time a
	  <code class="literal">GGC</code> motif in reverse direction:
	</p><div class="figure"><a name="sxa_xmastree_lenski2.png"></a><p class="title"><b>Figure 12.4. 
	    A genuine place of interest almost masked by the
	    <code class="literal">GGCxG</code> problem.
	  </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_xmastree_lenski2.png" width="100%" alt="A genuine place of interest almost masked by the GGCxG problem."></td></tr></table></div></div></div><br class="figure-break"></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_sxa_lowlights_chimericreads"></a>12.2.3.3. 
	  Chimeric reads
	</h4></div></div></div><p>
	  I did not realise chimeric reads were a problem with Illumina data
	  until Fall 2014 when I got reads &gt; 100bp for extremely well
	  characterised bacteria ... and because MIRA has always used data
	  cleaning methods which worked very well on either short reads &#8804;
	  100bp or when chimeras occurred at a very low frequency.
	</p><p>
	  Chimeras are artefact reads from library preparation which
	  contain parts of the sequence of interest which do not belong
	  together. E.g., in DNA from a bacterial genome, there may be one
	  read of 100 bp where the first 40 bp come from the genome position
	  at 100kb and the last 60 bp come from a position at 1300kb ... more
	  than one megabase apart.
	</p><p>
	  There is not much literature regarding chimeric sequences in
	  Illumina data: most of it deals with 16S or amplicon sequencing
	  where I always thought <span class="emphasis"><em>"that does not apply to my data
	  sets."</em></span> Well, tough luck ... it does. After some searching I
	  found some papers which report quite varying levels depending on the
	  protocols used.  Oyola et al. report between 0.24% and 2.3% of
	  chimeras (<span class="emphasis"><em>Optimizing illumina next-generation sequencing
	  library preparation for extremely at-biased genomes</em></span>; BMC
	  Genomics 2012, 13:1; doi:10.1186/1471-2164-13-1; <a class="ulink" href="http://www.biomedcentral.com/1471-2164/13/1" target="_top">http://www.biomedcentral.com/1471-2164/13/1</a>). Apparently, a
	  paper from researchers at the Sanger Centre reported up to 5%
	  chimeric reads (Bronner et al., <span class="emphasis"><em>Improved Protocols for
	  Illumina Sequencing</em></span>; Current Protocols in Human Genetics
	  18:18.2:18.2.1&#8211;18.2.42; DOI: 10.1002/0471142905.hg1802s80; <a class="ulink" href="http://onlinelibrary.wiley.com/doi/10.1002/0471142905.hg1802s80/abstract" target="_top">http://onlinelibrary.wiley.com/doi/10.1002/0471142905.hg1802s80/abstract</a>
	  via <a class="ulink" href="http://www.sagescience.com/blog/sanger-reports-improved-prep-protocols-for-illumina-sequencing/" target="_top">http://www.sagescience.com/blog/sanger-reports-improved-prep-protocols-for-illumina-sequencing/</a>).
	</p><p>
	  I have now seen MiSeq 250bp and 300bp paired-end genomic data sets
	  from different (trusted) sequencing providers for very well
	  characterised, non-complex and non-GC-extreme bacterial genomes with
	  up to 3% chimeric reads. To make things worse, some chimeras were
	  represented by both reads of a read-pair, so one had the exact same
	  chimeric sequence represented twice: once in forward and once in
	  reverse complement direction.
	</p><p>
	  It turned out that MIRA versions &#8804; 4.9.3 have problems in
	  filtering chimeras in Illumina data sets with reads &gt; 100bp as
	  the chimera detection algorithms were designed to handle amounts
	  much less than 1% of the total reads. This led to shorter contigs in
	  genomic assemblies and to chimeric transcripts (when they are very
	  low-coverage) in RNA assemblies.
	</p><p>
	  Note that projects using reads &#8804; 100 bp assembled fine with MIRA
	  4.9.3 and before as the default algorithms for proposed-end-clip
	  ([-CL:pec]) implicitly caught chimeras occurring near the
	  read ends and the remaining chimeras were caught by the algorithms
	  for low level chimeras.
	</p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	  MIRA 4.9.4 and higher eliminate all chimeras in Illumina reads of
	  any length, you do not need to take any precautionary steps
	  here. But if you use other assemblers and in light of the above, I
	  highly recommend to apply very stringent filters to Illumina data.
	  Especially for applications like metagenomics or RNA de-novo
	  assembly where low coverage may be expected for parts of the
	  results! Indeed, I now treat any assembly result with consensus data
	  generated from a coverage of less than 3 Illumina reads as
	  potentially junk data.
	</td></tr></table></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_sxa_lowlights_samplemix"></a>12.2.3.4. 
	  Sample barcode misidentification
	</h4></div></div></div><p>
	  Long story short: data from multiplexed samples contains "low"
	  amounts of foreign samples from the same lane. Probably not a
	  problem for high coverage assemblies, but can become a problem in
	  multiplexed RNASeq or projects looking for "rare" variants.
	</p><p>
	  In essence, the barcoding used for multiplexing several samples into
	  a single lane is not a 100% foolproof process. I found one paper
	  quantifying this effect to 0.3% of misidentified reads: Kircher et
	  al., <span class="emphasis"><em>Double indexing overcomes inaccuracies in multiplex
	  sequencing on the Illumina platform</em></span>; Nucleic Acids
	  Res. Jan 2012; 40(1): e3. <a class="ulink" href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3245947/" target="_top">http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3245947/</a>
	</p><p>
	  For example, I got some genome sequencing data for a bacterium where
	  closer inspection of some small contigs coming out of the assembly
	  process turned out to be highly expressed genes from a plant. The
	  sequencing provider had multiplexed our bacterial sample with a
	  RNASeq project of that plant.
	</p><p>
	  Another example involved RNASeq of two genomes where one of the
	  organisms had been modified to contain additional genes under a
	  strong promoter. In the data set we suddenly saw those inserted
	  genes pop up in the samples of the wild type organism. Which,
	  clearly, could not be.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_sxa_lowlights_nextera"></a>12.2.3.5. 
	  Nextera library prep
	</h4></div></div></div><p>
	  Opinions seem to be divided about Nextera: some people don't like it
	  as it introduces sometimes terrible coverage bias in the data, other
	  people say they're happy with the data.
	</p><p>
	  Someone told me (or wrote, I do not remember) that this divide may
	  be due to the fact that some people use their sequencing data for
	  de-novo assemblies, while others just do mappings and hunt for
	  SNPs. In fact, this would explain a lot: for de-novo assemblies, I
	  would never use Nextera. When on a hunt for SNPs, they may be OK.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_sxa_lowlights_gcbias"></a>12.2.3.6. 
	  Strong GC bias in some Solexa data (2nd half 2009 until advent of TruSeq kit at end of 2010)
	</h4></div></div></div><p>
	  I'm recycling a few slides from a couple of talks I held in 2010.
	</p><p>
	  Things used to be so nice and easy with the early Solexa data I worked
	  with (36 and 44mers) in late 2007 / early 2008. When sample taking was
	  done right -- e.g. for bacteria: in stationary phase -- and the
	  sequencing lab did a good job, the read coverage of the genome was
	  almost even. I did see a few papers claiming to see non-trivial GC
	  bias back then, but after having analysed the data I worked with I
	  dismissed them as "not relevant for my use cases." Have a look at the
	  following figure showing exemplarily the coverage of a 45% GC
	  bacterium in 2008:
	</p><div class="figure"><a name="sxa_gcbias_nobias2008.png"></a><p class="title"><b>Figure 12.5. 
	    Example for no GC coverage bias in 2008 Solexa data. Apart from a
	    slight <span class="emphasis"><em>smile shape</em></span> of the coverage --
	    indicating the sample taking was not 100% in stationary phase of the
	    bacterial culture -- everything looks pretty nice: the average
	    coverage is at 27x, and when looking at potential genome
	    duplications at twice the coverage (54x), there's nothing apart from a
	    single peak (which turned out to be a problem in a rRNA region).
	  </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_gcbias_nobias2008.png" width="100%" alt="Example for no GC coverage bias in 2008 Solexa data. Apart from a slight smile shape of the coverage -- indicating the sample taking was not 100% in stationary phase of the bacterial culture -- everything looks pretty nice: the average coverage is at 27x, and when looking at potential genome duplications at twice the coverage (54x), there's nothing apart a single peak (which turned out to be a problem in a rRNA region)."></td></tr></table></div></div></div><br class="figure-break"><p>
	  Things changed starting sometime in Q3 2009, at least that's when I
	  got some data which made me notice a problem. Have a look at the
	  following figure which shows exactly the same organism as in the
	  figure above (bacterium, 45% GC):
	</p><div class="figure"><a name="sxa_gcbias_bias2009.png"></a><p class="title"><b>Figure 12.6. 
	    Example for GC coverage bias starting Q3 2009 in Solexa
	    data. There's no <span class="emphasis"><em>smile shape</em></span> anymore -- the
	    people in the lab learned to pay attention to sample in 100%
	    stationary phase -- but something else is extremely disconcerting:
	    the average coverage is at 33x, and when looking at potential genome
	    duplications at twice the coverage (66x), there are several dozen
	    peaks crossing the 66x threshold over several kilobases (in one
	    case over 200 Kb) all over the genome. As if several small genome
	    duplications happened.
	  </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_gcbias_bias2009.png" width="100%" alt="Example for GC coverage bias starting Q3 2009 in Solexa data. There's no smile shape anymore -- the people in the lab learned to pay attention to sample in 100% stationary phase -- but something else is extremely disconcerting: the average coverage is at 33x, and when looking at potential genome duplications at twice the coverage (66x), there are several dozen peaks crossing the 66x threshold over a several kilobases (in one case over 200 Kb) all over the genome. As if several small genome duplications happened."></td></tr></table></div></div></div><br class="figure-break"><p>
	  By the way, the figures above are just examples: I saw over a dozen
	  sequencing projects in 2008 without GC bias and several dozen in 2009
	  / 2010 with GC bias.
	</p><p>
	  Checking the potential genome duplication sites, they all looked
	  "clean", i.e., the typical genome insertion markers are
	  missing. Poking around at possible explanations, I looked at GC
	  content of those parts in the genome ... and there was the
	  explanation:
	</p><div class="figure"><a name="sxa_gcbias_comp20082009.png"></a><p class="title"><b>Figure 12.7. 
	    Example for GC coverage bias, direct comparison 2008 / 2010
	    data. The bug has 45% average GC, areas with above average read
	    coverage in 2010 data turn out to be lower GC: around 33 to 36%. The
	    effect is also noticeable in the 2008 data, but barely so.
	  </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/sxa_gcbias_comp20082009.png" width="100%" alt="Example for GC coverage bias, direct comparison 2008 / 2010 data. The bug has 45% average GC, areas with above average read coverage in 2010 data turn out to be lower GC: around 33 to 36%. The effect is also noticeable in the 2008 data, but barely so."></td></tr></table></div></div></div><br class="figure-break"><p>
	  Now as to actually <span class="emphasis"><em>why</em></span> the GC bias suddenly
	  became so strong is unknown to me. The people in the lab have used the
	  same protocol for several years to extract the DNA and the sequencing
	  providers claim to always use the Illumina standard protocols.
	</p><p>
	  But obviously something must have changed.
	</p><p>
	  It took Illumina some 18 months to resolve that problem for the
	  broader public: ever since the data I work on have been produced with
	  the TruSeq kit, this problem has vanished.
	</p><p>
	  However, if you based some conclusions or wrote a paper with Illumina
	  data which might be affected by the GC bias (Q3 2009 to Q4 2010), I
	  suggest you rethink all the conclusions drawn. This should be
	  especially the case for transcriptomics experiments where a difference
	  in expression of 2x to 3x starts to get highly significant!
	</p></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_std_iontor"></a>12.3. 
      Ion Torrent
    </h2></div></div></div><p>
      As of January 2014, I would say Ion Torrent reads behave very much like
      late data from the 454 technology (FLX / Titanium chemistry): reads are
      on average &gt; 300bp and the homopolymer problem is much less
      pronounced than 2 years ago. The following figure shows what you can get
      out of 100bp reads if you're lucky:
    </p><div class="figure"><a name="chap_iontor::ion_dh10bgoodB13.png"></a><p class="title"><b>Figure 12.8. 
	Example for good IonTorrent data (100bp reads). Note that only a
	single sequencing error - shown by blue background - can be
	seen. Except this, all homopolymers of size 3 and 4 in the area
	shown are good.
      </b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/ion_dh10bgoodB13.png" width="100%" alt="Example for good IonTorrent data (100bp reads). Note that only a single sequencing error - shown by blue background - can be seen. Except this, all homopolymers of size 3 and 4 in the area shown are good."></td></tr></table></div></div></div><br class="figure-break"><p>
      The "if you're lucky" part in the preceding sentence is not there by
      accident: having so many clean reads is more of an exception rather than
      a rule. On the other hand, most sequencing errors in current IonTorrent
      data are unproblematic ... if it were not for indels, which are going to
      be explained in the next sections.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_iontor_hpindels"></a>12.3.1. 
	Homopolymer insertions / deletions
      </h3></div></div></div><p>
	The main source of error in your data will be insertions / deletions
	(indels) especially in homopolymer regions (but not only there, see
	also next section). Starting with a base run of 4 to 6 bases, there
	is a distinct tendency to have an increased occurrence of indel
	errors.
      </p><div class="figure"><a name="chap_iontor::iontor_indelhpexample.png"></a><p class="title"><b>Figure 12.9. 
	  Example for problematic IonTorrent data (100bp reads).
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/iontor_indelhpexample.png" width="100%" alt="Example for problematic IonTorrent data (100bp reads)."></td></tr></table></div></div></div><br class="figure-break"></div><p>
      The above figure contains a couple of particularly nasty indel
      problems. While areas 2 (C-homopolymer length 3), 5 (A-homopolymer
      length 4) and 6 (T-homopolymer length 3) are not a big problem as most
      of the reads got the length right, the areas 1, 3 and 4 are nasty.
    </p><p>
      Area 1 is an A-homopolymer of length 7 and while many reads get that
      length right (enough to tell MIRA what the true length is), it also
      contains reads with a length of 6 and others with a length of 8.
    </p><p>
      Area 3 is an "A-homopolymer" of length 2 where approximately half of the
      reads get the length right, the other half not. See also the following
      section.
    </p><p>
      Area 4 is a T-homopolymer of length 5 which also has approximately half
      the reads with a wrong length of 4.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_iontor_seqdirdepindels"></a>12.3.2. 
	Sequencing direction dependent insertions / deletions
      </h3></div></div></div><p>
	In the previous section, the screen shot showing indels had an indel
	at a homopolymer of 2, which is something quite curious. Upon closer
	investigation, one might notice a pattern in the gap/nogap
	distribution: it is almost identical to the orientation of build
	direction of reads!
      </p><p>
	I looked for other examples of this behaviour and found quite a
	number of them, the following figure shows a very clear case of that
	error behaviour:
      </p><div class="figure"><a name="chap_iontor::ion_dh10bdirdepindel.png.png"></a><p class="title"><b>Figure 12.10. 
	  Example for a sequencing direction dependent indel. Note how all
	  but one of the reads in '+' direction miss a base while all reads
	  built in '-' direction have the correct number of bases.
	</b></p><div class="figure-contents"><div class="mediaobject"><table border="0" summary="manufactured viewport for HTML img" style="cellpadding: 0; cellspacing: 0;" width="90%"><tr><td><img src="bookfigures/ion_dh10bdirdepindel.png" width="100%" alt="Example for a sequencing direction dependent indel. Note how all but one of the reads in '+' direction miss a base while all reads built in '-' direction have the correct number of bases."></td></tr></table></div></div></div><br class="figure-break"><p>
	This is quite astonishing: the problem occurs at a site without real
	homopolymer (calling a 2-bases run a 'homopolymer' starts stretching
	the definition a bit) and there are no major problematic homopolymer
	sites near. In fact, this was more or less the case for all sites I
	had a look at.
      </p><p>
	Neither did the cases which were investigated show common base
	patterns, so unlike the Solexa GGCxG motif it does not look like
	that error of IonTorrent is bound to a particular motif.
      </p><p>
	While I cannot prove the following statement, I somehow suspect that
	there must be some kind of secondary structure forming which leads to
	that kind of sequencing error. If anyone has a good explanation I'd be
	happy to hear it: feel free to contact me at
	<code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code>.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_iontor_covvariance"></a>12.3.3. 
	Coverage variance
      </h3></div></div></div><p>
	The coverage variance with the old ~100bp reads was a bit on the
	bad side for low coverage projects (10x to 15x): it varied wildly,
	sometimes dropping to nearly zero, sometimes reaching approximately
	double the coverage.
      </p><p>
	This has now improved and I have not seen pronounced coverage variance
	in the data sets I have worked on.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_iontor_gcbias"></a>12.3.4. 
	GC bias
      </h3></div></div></div><p>
	The GC bias seems to be small to non-existent, at least I could not
	immediately make a correlation between GC content and coverage.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_iontor_other_sources_of_error"></a>12.3.5. 
	Other sources of error
      </h3></div></div></div><p>
	You will want to keep an eye on the clipping of the data in the SFF
	files from IonTorrent: while it is generally good enough, some data
	sets of IonTorrent show that - for some error patterns - the clipping
	is too lax and strange artefacts appear. MIRA will take care of these
	- or at least of those it knows - but you should be aware of this
	potential problem.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_iontor_where_to_find_further_information"></a>12.3.6. 
	Where to find further information
      </h3></div></div></div><p>
	IonTorrent being pretty new, getting as much information as possible on
	that technology is quite important. So here are a couple of links I found
	to be helpful:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    There is, of course, the TorrentDev site (<a class="ulink" href="http://lifetech-it.hosted.jivesoftware.com/community/torrent_dev" target="_top">http://lifetech-it.hosted.jivesoftware.com/community/torrent_dev</a>)
	    at Life Technologies which will be helpful to get a couple of
	    questions answered.
	  </p><p>
	    Just be aware that some of the documents over there are sometimes
	    painting an - how should I say it diplomatically? - overly
	    optimistic view on the performance of the technology. On the
	    other hand, so do documents released by the main competitors
	    like 454/Roche, Illumina, PacBio etc. ... so no harm done there.
	  </p></li><li class="listitem"><p>
	    I found Nick Loman's blog <a class="ulink" href="http://pathogenomics.bham.ac.uk/blog/" target="_top">Pathogens: Genes and
	    Genomes</a> to be my currently most valuable source of
	    information on IonTorrent. While the group he works for won a
	    sequencer from IonTorrent, he makes that fact very clear and still
	    unsparingly dissects the data he gets from that machine.
	  </p><p>
	    His posts got me going in getting MIRA grok IonTorrent.
	  </p></li><li class="listitem"><p>
	    The blog of Lex Nederbragt <a class="ulink" href="http://flxlexblog.wordpress.com/" target="_top">In between lines of
	    code</a> is playing in the same league: very down to earth and
	    he knows a bluff when he sees it ... and is not afraid to call it
	    (be it from IonTorrent, PacBio or 454).
	  </p><p>
	    The analysis he did on a couple of Ion data sets has saved me
	    quite some time.
	  </p></li><li class="listitem"><p>
	    Last, but not least, the board with <a class="ulink" href="http://seqanswers.com/forums/forumdisplay.php?f=40" target="_top">IonTorrent-related-stuff</a>
	    over at <a class="ulink" href="http://seqanswers.com/" target="_top">SeqAnswers</a>,
	    the first and foremost one-stop-shop ... erm ... discussion board
	    for everything related to sequencing nowadays.
	  </p></li></ul></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_std_pacbio"></a>12.4. 
      Pacific BioSciences
    </h2></div></div></div><p>
      As of January 2014, PacBio should be seen as <span class="emphasis"><em>the</em></span>
      technology to go to for de-novo sequencing of bacteria and lower
      eukaryotes. Period. Complement it with a bit of Illumina to get rid of
      the last remaining errors and you'll have - for a couple of thousand
      Euros - the best genome sequences money can buy.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_pb_highlights"></a>12.4.1. 
	Highlights
      </h3></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_hl_length"></a>12.4.1.1. 
	  Sequence lengths
	</h4></div></div></div><p>
	  Just one word: huge. At least compared to other currently existing
	  technologies. It is not unusual to get average - usable - read lengths
	  of more than 3 to 4 kb, some chemistries doubling that number (at
	  the expense of accuracy). The largest - usable - reads I have seen
	  were &gt; 25kb, though one needs to keep in mind that these are
	  quite rare and one does not see many of them in a project.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_hl_gcbias"></a>12.4.1.2. 
	  GC bias
	</h4></div></div></div><p>
	  I have seen none in my projects so far, neither have I in public
	  data. But these were certainly not as many projects as Sanger, 454,
	  Illumina and Ion, so take this with a grain of salt.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_hl_acccorrected"></a>12.4.1.3. 
	  Accuracy of corrected reads
	</h4></div></div></div><p>
	  Once the raw PacBio data has been corrected (HGAP pipeline), the
	  resulting reads have a pretty good accuracy. There still are
	  occasional homopolymer errors remaining at non-random locations, but
	  they are a minor problem.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_hl_qualassemblies"></a>12.4.1.4. 
	  Assemblies of corrected reads
	</h4></div></div></div><p>
	  The assemblies coming out of the HGAP pipeline are already
	  astoundingly good. Of course you get long contigs, but also the
	  number of miscalled consensus bases is not too bad: 1 error per 20
	  kb. Once the program
	  <span class="command"><strong>Quiver</strong></span> went through the assembly to do its magic
	  in polishing, the quality improves further into the range of 1
	  error per 50kb to 1 error per 250kb.
	</p><p>
	  In my hands, I get even better assemblies with MIRA (longer contigs
	  which span repeats unresolved by HGAP). When combining this with
	  some low coverage Illumina data (say, 50x) to do cheap polishing,
	  the error rates I get are lower than 1 error in 4 megabases.
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	  Take the above with a grain of salt as at the time of this writing,
	  I analysed in depth only a couple of bacteria. For ploidal
	  organisms I have just played a bit around with public data without
	  really doing an in depth analysis there.
	</td></tr></table></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_std_pb_lowlights"></a>12.4.2. 
	Lowlights
      </h3></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_ll_namingconfusion"></a>12.4.2.1. 
	  Naming confusion
	</h4></div></div></div><p>
	  With PacBio, there are quite a number of read types being thrown
	  around and which do confuse people: <span class="emphasis"><em>polymerase
	  reads</em></span>, <span class="emphasis"><em>quality clipped
	  reads</em></span>, <span class="emphasis"><em>subreads</em></span>, <span class="emphasis"><em>corrected
	  reads</em></span> and maybe some more I currently forgot. Here's the
	  total unofficial guide on how to keep those things apart:
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      <span class="bold"><strong>polymerase reads</strong></span> are the rawest
	      and most unedited stuff you may come into contact with. You can see
	      it as "data fresh from the machine" and the number of megabases
	      there is usually the one sequencing providers sell to you.
	    </p><p>
	      The sequencing technology PacBio employs uses special hairpin
	      adaptors they have named SMRTBell, and these adaptors will be
	      present in the polymerase reads together with the fragments of
	      your DNA.
	    </p><p>
	      In terms of regular expression look-alike, the data in
	      polymerase reads has the following form:
	    </p><pre class="screen">(Adaptor + (forward fragment sequence + (Adaptor + (fragment sequence in reverse complement))))*</pre><p>
	      E.g., some of your <span class="emphasis"><em>polymerase reads</em></span> will
	      contain just the adaptor and (part of) a fragment sequence:
	      Adap+FwdSeq. Others might contain: Adap+FwdSeq+Adap+RevSeq. And
	      still others might contain: multiple copies of
	      Adap+FwdSeq+Adap+RevSeq.
	    </p></li><li class="listitem"><span class="bold"><strong>quality clipped reads</strong></span> are
	    simply <span class="emphasis"><em>polymerase reads</em></span> where some sort of
	    first quality clipping has been done.
	  </li><li class="listitem"><span class="bold"><strong>subreads</strong></span> are <span class="emphasis"><em>quality
	    clipped reads</em></span> where the adaptors have been removed and
	    the read split into forward fragment sequences and reverse
	    fragment sequences. Hence, one quality clipped polymerase read can
	    yield several subreads.
	  </li><li class="listitem"><p>
	      <span class="bold"><strong>corrected (sub)reads</strong></span> are
	      subreads where through the magic of lots of computational power
	      and a very high coverage of subreads, the errors have been
	      almost completely removed from the subreads.
	    </p><p>
	      This is usually done only on a part of the subreads as it takes
	      already long enough (several hundred hours CPU for a simple
	      bacterium).
	    </p></li></ul></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_ll_revseq"></a>12.4.2.2. 
	  Forward / reverse chimeric sequences
	</h4></div></div></div><p>
	  The splitting of polymerase reads into subreads (see above) needs
	  the SMRTBell adaptor to be recognised by motif searching
	  programs. Unfortunately, it looks as if some "low percentage"
	  of reads have a self-looped end instead of an adaptor. Which in turn
	  means that the subread splitting will not split those reads and you
	  end up with a chimeric sequence.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_ll_rawreadaccuracy"></a>12.4.2.3. 
	  Accuracy of uncorrected subreads
	</h4></div></div></div><p>
	  You need to be brave now: the accuracy of the unclipped
	  polymerase reads is usually only about 50%. That is: on average
	  every second base is wrong. And I have seen a project where this
	  accuracy was only 14% (6 out of 7 bases are wrong).
	</p><p>
	  After clipping, the average accuracy of the polymerase reads should
	  be anywhere between 80% and 85% (this depends a little bit on the
	  chemistry used), which translates to: every 5th to every 7th base is
	  wrong. The vast majority of errors being insertions or deletions, not
	  base substitutions.
	</p><p>
	  80% to 85% accuracy with indels as primary error is unfortunately
	  something assemblers cannot use very well. Read: not at all if you
	  want good assemblies (at least I know no program which does
	  that). Therefore, one needs to apply some sort of correction
	  ... which needs quite a deal of CPU, see below.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_ll_cpu"></a>12.4.2.4. 
	  Immense need for CPU power
	</h4></div></div></div><p>
	  The above mentioned accuracies of 80% to 85% are too low for any
	  existing assembler I know to be correctly assembled. Therefore,
	  people came up with the idea of doing error correction on subreads
	  to improve their quality.
	</p><p>
	  There are two major approaches: 1) correcting PacBio subreads with
	  other technologies with shorter reads and 2) correcting long PacBio
	  subreads with shorter PacBio subreads. Both approaches have been
	  shown to work, though there seems to be a preference nowadays to use
	  the second option as the "shorter" PacBio reads provide the benefit
	  of being still longer than reads from other technologies and hence
	  provide a better repeat resolution.
	</p><p>
	  Anyway, the amount of CPU power needed for any method above is
	  something to keep in mind: bacteria with 3 to 5 megabases at a 100x
	  polymerase read coverage can take several hundred hours of CPU for
	  the correction step.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_std_pb_ll_dnaprep"></a>12.4.2.5. 
	  Increased quality requirements for clean DNA sample prep
	</h4></div></div></div><p>
	  This is a problem which cannot be really attributed to PacBio: one
	  absolutely needs to check whether the protocols used "since ever"
	  for DNA extraction yield results which are clean and long enough for
	  PacBio. Often they are not.
	</p><p>
	  The reason for this being a problem is simple: PacBio can sequence
	  really long fragments, but if your DNA extraction protocol smashed
	  the DNA into small pieces, then no sequencing technology in this
	  universe will be able to give you long reads for small fragments.
	</p></div></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_seqadvice"></a>Chapter 13. Some advice when going into a sequencing project</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_seqadv_seqprovider">13.1. 
      Talk to your sequencing provider(s) before sequencing
    </a></span></dt><dt><span class="sect1"><a href="#sect_seqadv_whichseqprovider">13.2. 
      Choosing a sequencing provider
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_seqadv_whichseqprovider_want">13.2.1. 
	WHAT DO YOU WANT?!
      </a></span></dt><dt><span class="sect2"><a href="#sect_seqadv_whichseqprovider_need">13.2.2. 
	WHAT DO YOU NEED?!
      </a></span></dt><dt><span class="sect2"><a href="#sect_seqadv_whichseqprovider_cost">13.2.3. 
	WHAT WILL IT COST ME?
      </a></span></dt><dt><span class="sect2"><a href="#sect_seqadv_whichseqprovider_where">13.2.4. 
	WHERE TO SEQUENCE?
      </a></span></dt><dt><span class="sect2"><a href="#sect_seqadv_whichseqprovider_summary">13.2.5. 
	Summary of all the above
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_seqadv_specific">13.3. 
      Specific advice
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_seqadv_technologies">13.3.1. 
	Technologies
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_seqadv_technologies_sanger">13.3.1.1. 
	  Sanger
	</a></span></dt><dt><span class="sect3"><a href="#sect_seqadv_technologies_pacbio">13.3.1.2. 
	  Pacific Biosciences
	</a></span></dt><dt><span class="sect3"><a href="#sect_seqadv_technologies_illumina">13.3.1.3. 
	  Illumina
	</a></span></dt><dt><span class="sect3"><a href="#sect_seqadv_technologies_iontorrent">13.3.1.4. 
	  Ion Torrent
	</a></span></dt><dt><span class="sect3"><a href="#sect_seqadv_technologies_454">13.3.1.5. 
	  Roche 454
	</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect_seqadv_denovo">13.3.2. 
	Sequencing de-novo
      </a></span></dt><dt><span class="sect2"><a href="#sect_seqadv_mapping">13.3.3. 
	Re-sequencing / mapping
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_seqadv_a_word_or_two_on_coverage">13.4. 
      A word or two on coverage ...
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_seqadv_lowcov">13.4.1. 
	Low coverage isn't worth it
      </a></span></dt><dt><span class="sect2"><a href="#sect_seqadv_highcov">13.4.2. 
	Catch-22: too high coverage
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_seqadv_when_sequencing_a_word_of_caution_regarding_your_dna">13.5. 
      A word of caution regarding your DNA in hybrid sequencing projects
    </a></span></dt><dt><span class="sect1"><a href="#sect_seqadv_for_bacteria">13.6. 
      Advice for bacteria
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_seqadv_for_bacteria_no_not_sample_in_exponential_phase">13.6.1. 
	Do not sample DNA from bacteria in exponential growth phase!
      </a></span></dt><dt><span class="sect2"><a href="#sect_seqadv_for_bacteria:_beware_of_high_copy_number_plasmids">13.6.2. 
	Beware of (high copy number) plasmids!
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em>
	<span class="quote">&#8220;<span class="quote">
	  Reliable information lets you say 'I don't know' with real confidence.
	</span>&#8221;</span>
      </em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_seqadv_seqprovider"></a>13.1. 
      Talk to your sequencing provider(s) before sequencing
    </h2></div></div></div><p>
      Well, duh! But it's interesting what kind of mails I sometimes get. Like in:
    </p><div class="blockquote"><blockquote class="blockquote"><span class="quote">&#8220;<span class="quote">We've sequenced a one gigabase, diploid eukaryote with
    Solexa 36bp paired-end with 200bp insert size at 25x coverage. Could you
    please tell us how to assemble this data set de-novo to get a finished
    genome?</span>&#8221;</span></blockquote></div><p>
      A situation like the above should have never happened. Good sequencing
      providers are interested in keeping customers long term and will
      therefore try to find out what exactly your needs are. These folks
      generally know their stuff (they're making a living out of it) and most
      of the time propose you a strategy that fulfills your needs for a near
      minimum amount of money.
    </p><p>
      Listen to them.
    </p><p>
      If you think they try to rip you off or are overselling their
      competences (which most providers I know won't even think of trying,
      but there are some), ask for a quote from a couple of other
      providers. You'll see pretty quickly if there are some things not being
      right.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      As a matter of fact, a rule which has saved me time and again for
      finding sequencing providers is not to go for the cheapest provider,
      especially if their price is far below quotes from other
      providers. They're cutting corners somewhere others don't cut for a
      reason.
    </td></tr></table></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_seqadv_whichseqprovider"></a>13.2. 
      Choosing a sequencing provider
    </h2></div></div></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      This is a slightly reworked version of a post I made on the MIRA talk
      mailing list.  The question <span class="emphasis"><em>"Could you please recommend me a
      sequencing provider?"</em></span> arrives every now and then in my
      private inbox, often enough for me to decide to make a collage of the
      responses I gave in the past and post it to MIRA talk.
    </td></tr></table></div><p>
      This response got, errrr, a little longer, but allow me to note that I
      will not give you names. The reasons are manifold:
    </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem">
	once upon a time I worked for a sequencing company
      </li><li class="listitem">
	the company I am currently employed with is not in the sequencing
	provider business, but the company uses more than one sequencing
	provider on a regular basis and I get to see quite some data
      </li><li class="listitem">
	due to my development on MIRA in my free time, I'm getting insight
	into a number of highs and lows of sequencing technologies at
	different sequencing providers which I would not get if I were to
	expose them publicly ... I do not want to jeopardise these
	relationships.
      </li></ul></div><p>
      That being said, there are a number of general considerations which
      could help you. Excuse me in case the detours I am going to make are
      obvious to you, but I'm writing this also for future references. Also,
      please bear with me if I look at "sequencing" a bit differently than you
      might be accustomed to from academia, but I have worked for quite some
      time now in industry ... and there cost-effectiveness respectively
      "probability of success" of a project as a whole is paramount to
      everything else. I'll come back to that further down.
    </p><p>
      There's one -- and only one -- question which you, as sequencing
      customer, need to be able to answer ... if necessary in every
      excruciating detail, but you must know the answer. The question is:
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_whichseqprovider_want"></a>13.2.1. 
	WHAT DO YOU WANT?!
      </h3></div></div></div><div class="sidebar"><div class="titlepage"><div><div><p class="title"><b>
	  Detour - Sequencing -
	</b></p></div></div></div><p>
	  For me, every "sequencing project", be it genomic or transcriptomic,
	  really consists of four major phases:
	</p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	      <span class="bold"><strong>data generation:</strong></span> This can be
	      broadly seen as everything to get the DNA/RNA ready to be sent
	      off to sequencing (usually something the client does), the
	      library prep at the sequencing provider and finally the
	      sequencing itself (including base calling). An area of thousand
	      pitfalls where each step (and the communication) is crucial and
	      even one slight inadvertence can make the difference between a
	      "simple" project and a "hard" project. E.g.: taking DNA from
	      growing cells (especially bacteria in exponential growing phase)
	      might not be a good idea ... it makes assembly more
	      difficult. Some DNA extraction methods generate more junk than
	      good fragments etc.pp
	    </p><p>
	      The reason I am emphasizing this is simple: nowadays, the
	      "sequencing" itself is not the most expensive part of a
	      sequencing project, the next two steps are (most of the time
	      anyway).
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>assembly &amp; finishing:</strong></span> Still
	      a hard problem. Even a "simple" bacterium can present weeks of
	      effort to get right if it's riddled with phages, prophages,
	      transposon elements, genetically engineered repeats etc.pp. And
	      starting with eukaryotes the real fun starts: ploidy,
	      retrotransposons etc. make for an unbelievable genome plasticity
	      and almost always have their own surprises. I've seen "simple"
	      Saccharomyces cerevisiae - where biologists swore to high heaven
	      they were "close to the publicly sequenced strains" - being
	      *very* different from what they were expected to be, both on the
	      DNA level and the genome organisation level.
	    </p><p>
	      Getting eukaryotes right "down to the last base" might cost
	      quite some money, especially when looping back to step 1 (data
	      generation) to tackle difficult areas.
	    </p></li><li class="listitem"><p>
	      <span class="bold"><strong>annotation:</strong></span> Something many
	      people forget: give the sequence a meaning. Here too, things can
	      get quite costly if done "right", i.e., with hand
	      curation. Especially on organisms which are not part of the more
	      commonly sequenced species or are generally more complex.
	    </p><p>
	      Annotation of a de-novo transcriptome assembly is also not for
	      the faint of heart, especially if done on short, unpaired read
	      assemblies.
	    </p></li><li class="listitem"><span class="bold"><strong>using the sequencing data:</strong></span>
	    ... for whatever it was generated for.
	  </li></ol></div></div><p>
	The above makes it clear that, depending on what you are really
	interested in within your project and what you expect to be able to do
	with the sequencing data, one can cut corners and reduce cost here and
	there (but not everywhere). And therefore, the above question "What do
	you want?" is one which - after the initial chit-chat of "hi, hello,
	nice to meet you, a pleasure to be here, etc." - every good
	representative of respectable sequencing providers I have met so far
	will ask as the very first question. Usually in the form of "what do you
	want to sequence and what will you want to use the data for (and what
	not)?"
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_whichseqprovider_need"></a>13.2.2. 
	WHAT DO YOU NEED?!
      </h3></div></div></div><p>
	... difference between "want" and "need" ...
      </p><p>
	Every other question - like where to sequence, which sequencing
	technology to use, how to process the sequencing data afterwards - is
	incidental and subordinated to your answer(s) to the question of "what
	do you want?!" But often sequencing customers get their priorities
	wrong by putting forward another question:
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_whichseqprovider_cost"></a>13.2.3. 
	WHAT WILL IT COST ME?
      </h3></div></div></div><p>
	And its inevitable companion question "Can you make it cheaper?"
      </p><div class="sidebar"><div class="titlepage"><div><div><p class="title"><b>
	  Detour - Putting things into perspective -
	</b></p></div></div></div><p>
	  Come to think of it, people sometimes have very interesting ideas
	  regarding costs. Interesting as in "outright silly." It may be
	  because they do not really know what they want or feel unsure on a
	  terrain unbeknownst to them, and often instead focus their energy on
	  single aspects of a wider project because they feel more at home
	  there. And suddenly the focus lies on haggling and bartering for
	  some prices because, after all, this is something everyone knows how
	  to do, right?
	</p><p>
	  As I hinted earlier, the pure sequencing costs are nowadays probably
	  not the biggest factor in any sequencing project: 454, Illumina,
	  IonTorrent and other technology providers have seen to that. E.g.,
	  in 2003/2004 it still cost somewhere between 150 - 200 k&#8364; to get an
	  8x Sanger coverage of a moderately sized bacterium (4 to 5
	  mb). Nowadays, for the same organism, you get coverages in the
	  dozens (going with 454) for a few thousand Euro ... or coverages in
	  the hundreds or even thousands (going with Illumina) for a few
	  hundred Euro.
	</p><p>
	  Cost for assembly, finishing and annotation have not followed the
	  same decrease. Yes, advances in algorithms have made things easier
	  in some parts, but not really on the same scale. Furthermore, the
	  "short read" technologies have more than made up for algorithmical
	  complexity when compared to the old Sanger reads. Maybe that
	  "(ultra)long read" technologies will alleviate the problem, but I
	  would not hold my breath for them to really work well.
	</p><p>
	  One thing however has almost not changed at all: your costs of
	  actually doing followup experiments and data interpretation!
	  Remember that sequencing in itself is most of the time not the
	  ultimate goal, you actually want to gain something out of it. Be it
	  abstract knowledge for a paper or concrete hints for producing some
	  compounds or whatever, chances are that you will actually devote a
	  substantial amount of your resources (time, manpower, mental health)
	  into followup activities (lab experiments, genetic engineering,
	  writing papers) to turn the abstract act of sequencing into
	  something tangible, be it papers, fame, new products, money, or
	  whatever you want to achieve.
	</p><p>
	  And this is the place where it pays to stop and think: "what do I
	  want? what are my strengths and where are my weaknesses? where are
	  my priorities?" The English have a nice saying: "Being penny-wise
	  and pound-foolish is not wise." I may add: Especially not if you are
	  basing man months / years of lab work and your career on the outcome
	  of something like sequencing. Maybe I'm spoiled because I have left
	  academia for quite some time now, but in sequencing I always prefer
	  to throw a bit more money at the sequencing process itself to
	  minimise risks of the later stages.
	</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_whichseqprovider_where"></a>13.2.4. 
	WHERE TO SEQUENCE?
      </h3></div></div></div><p>
	There's one last detour I'd like to make, and that is the question of "where to sequence?"
      </p><div class="sidebar"><div class="titlepage"><div><div><p class="title"><b>
	  Detour - Public or private, old-timers or young-timers ? -
	</b></p></div></div></div><p>
	  Choosing a sequencing provider is highly dependent on your answer to
	  "what do you want?" Wanting to keep the sequencing data (or
	  the very act of sequencing) secret (even only for some time) will
	  probably lead you to commercial sequencing companies. There you more
	  or less have complete control over the data. Paranoid people might
	  perhaps argue that you can have that only with your own sequencing
	  equipment and personnel, but I have the feeling that only a minority
	  is able to cough up the necessary money for purchasing sequencing
	  equipment for a small one-time project.
	</p><p>
	  Instead of companies you could however also look whether one of the
	  existing sequencing centers in the world might be a good cooperation
	  candidate. Especially if you are doing this project within the scope
	  of your university. Note however that there might be a number of
	  gotchas lurking there, beside the obvious "the data is not really
	  secret anymore": sometimes the raw sequencing data needs to be
	  publicly released, maybe earlier than you would like; or the
	  sequencing center imposes that each and every paper you publish with
	  that data as basis has them as (co-)first author.
	</p><p>
	  A related problem is "whom do I trust to deliver good work?"
	  Intuition says that institutes with a long sequencing history have
	  amassed quite some knowledge in this field, making them experts in
	  all three aspects (data generation, assembly &amp; finishing,
	  annotation) of a sequencing project ... and intuition probably isn't
	  wrong there. The same thing is probably true for sequencing
	  companies which have existed for more than just a couple of years,
	  though what I have seen so far is that - due to size -
	  sequencing companies sometimes really focus on the data generation
	  and rely on partner companies for "assembly" and "annotation". This
	  is not to say that younger companies are bad. Incidentally, it is my
	  belief that in this field, people are still more important than
	  technology ... and every once in a while good people split off a
	  well known institute (or company) to try their luck in their own
	  company. Always look for references there.
	</p><p>
	  The following statement is a personal opinion (and you can call me
	  biased for that): Personally, I am however quite wary of sequencing
	  done at locations where a sequencer exists because someone got a
	  grant to buy one (because it was chic &amp; en-vogue to get a shiny
	  new toy) but where the instrument then slowly starts to collect dust
	  after the initial flurry ... and because people often do not
	  calculate chemistry costs which arise in case they'd really thought
	  of using the machine 24/7. I want to know that technicians actually
	  work with those things every day, that they know the ins and outs of
	  the work, the protocols, the chemistry, the moods of the machine
	  (even an instrument can have a bad day). I honestly do not believe
	  that one can build up enough expertise when operating these things
	  "every once in a while".
	</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_whichseqprovider_summary"></a>13.2.5. 
	Summary of all the above
      </h3></div></div></div><p>
	All of the above means that depending on what I need the data for, I
	have the freedom to choose among different providers. In case I just need
	masses of raw data and potential savings are substantial, I might go
	with the cheapest whom I know to generate good data. If I want good
	service and a second round of data in case I am not 110% satisfied with
	the first round (somehow people have stopped questioning me there),
	this is usually not the cheapest provider ... but the additional costs
	are not really high. If I wanted my data really really quick, I'd
	search for a provider with Ion Torrent, or MiSeq (I am actually
	looking for one with a MiSeq, so if anyone knows a good one,
	preferably in Europe -&gt; mail me). Though I already did transcriptomics
	on eukaryotes, in case I needed larger eukaryotes assembled de-novo
	&amp; also annotated, I would probably look for the help of a larger
	sequencing center as this starts to get dangerously near the fringe of
	my field of expertise.
      </p><p>
	In closing this part, here are a couple of guidelines which have not
	failed me so far for choosing sequencing providers:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem">
	  Building a good relationship helps. In case your institute /
	  university already has good (or OK) experience with a provider, ask
	  there first.
	</li><li class="listitem">
	  It is a lot easier to build a good relationship with someone who
	  speaks your language ... or good(!) English.
	</li><li class="listitem">
	  I will not haggle for a couple of hundred Euros in a single project,
	  but I'll certainly reconsider this when savings are in the tens of
	  thousands.
	</li><li class="listitem">
	  Managing expectations: some sequencing projects are high risk from
	  the start, for lots of possible reasons (underfunded, bad starting
	  material, unclear organism). This is *sometimes* (!) OK as long as
	  everyone involved knows and acknowledges this. However, you should
	  always have a clear target ("what am I looking for?") and preferably
	  know in advance how to treat the data to get there.
	</li><li class="listitem">
	  Errors occur, stay friendly at first. In case the expectations were
	  clear (see above), the material and organism are not at fault but
	  the data quality somehow is bad, it is not too difficult to have the
	  sequencing provider acknowledge this and get additional sequencing
	  for no added cost.
	</li></ul></div><p>
	Regarding the technologies you can use ... it really depends on what
	you want to do :-) And note that I base my answers on technologies
	available today without bigger problems: PacBio, Illumina, with
	IonTorrent as Joker for quick projects. 454 can still be considered,
	but probably not for too long anymore as Roche stopped development of
	the technology and thus PacBio takes over the part for long
	reads. Oxford Nanopore might become a game changer, but they are not
	there just yet.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_seqadv_specific"></a>13.3. 
      Specific advice
    </h2></div></div></div><p>
      Here's how I see things as of now (January 2014), which might not
      necessarily be how others see them.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_technologies"></a>13.3.1. 
	Technologies
      </h3></div></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_seqadv_technologies_sanger"></a>13.3.1.1. 
	  Sanger
	</h4></div></div></div><p>
	  Use for: checking assemblies; closing gaps by PCR; checking for a couple of genes with
	  known sequence (i.e., where you can design oligos for).
	</p><p>
	  Do not use for: anything else. In particular, if you find yourself
	  designing oligos for a 96 well plate destined for Sanger sequencing
	  of a single bacterial DNA sample, you (probably) are doing something
	  wrong.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_seqadv_technologies_pacbio"></a>13.3.1.2. 
	  Pacific Biosciences
	</h4></div></div></div><p>
	  Use for: de-novo of bacteria and lower eukaryotes (or higher
	  eukaryotes if you have the money). PacBio should be seen as
	  <span class="emphasis"><em>the</em></span> technology to use when getting the best
	  assemblies with least number of contigs is important to you. Also,
	  resequencing of variants of known organisms with lots of genomic
	  reorganisation flexibility due to high numbers of transposons (where
	  short reads will not help in getting the chromosome assembled/mapped
	  correctly).
	</p><p>
	  Do not use for: resequencing of "dull" organisms (where the only
	  differences will be simple SNPs or simple insertion/deletions or
	  simple contig reorganisations at non-repetitive places). Illumina
	  will do a much better and cost effective job there.
	</p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top"><p>
	    As of January 2014: aim for at least 100x coverage of raw data,
	    better 130x to 150x as pre-processing (quality clip, removal of
	    adapters and other sequencing artefacts) will take its toll and
	    reduce the data by up to 1/3. After that, the error
	    correction/self-correction of raw reads into corrected reads will
	    again reduce the data considerably.
	  </p><p>
	    It's really a numbers game: the more data you have, the more
	    likely you will also get many of those really long reads in the 5
	    to 30 Kb range which are extremely useful to get over those nasty
	    repeats.
	  </p></td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
	  MIRA will most probably give you longer contigs with corrected
	  PacBio reads than you get with the HGAP pipeline, but the number of
	  indel errors will currently be higher. Either use Quiver on the
	  results of MIRA ... or simply polish the assembly with a cheap
	  Illumina data set. The latter approach will also give you better
	  results than a Quiver approach.
	</td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	  For non-haploid organisms, you might need more coverage to get
	  enough data at ploidy sites to get the reads correctly out of
	  error correction.
	</td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	  Preparation of your DNA sample is not trivial as many methods will
	  break your DNA into "small" chunks which are good enough for
	  Sanger, 454, Illumina or Ion Torrents, but not for PacBio.
	</td></tr></table></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_seqadv_technologies_illumina"></a>13.3.1.3. 
	  Illumina
	</h4></div></div></div><p>
	  Use for: general resequencing jobs (finding SNPs, indel locations of
	  any size, copy number variations etc.); gene expression analysis;
	  cheap test sequencing of unknown organisms to assess complexity;
	  de-novo sequencing if you are OK with getting hundreds / thousands
	  of contigs (depending on organism, some bacteria get only a few
	  dozen).
	</p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	  Careful with high GC organisms, starting with 60% to 65% GC Illumina
	  reads contain more errors: SNP detection may be less reliable if
	  extreme care is not taken to perform good read clipping. Especially
	  the dreaded GGCxG motif often leads to problems in Illumina reads.
	</td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	  For de-novo assemblies, do <span class="emphasis"><em>NOT</em></span> (never ever at
	  all and under no circumstances) use the Nextera kit, take
	  TruSeq. The non-random fragmentation behaviour of Nextera leads to
	  all sorts of problems for assemblers (not only MIRA) which try to
	  use kmer frequencies as a criterion for repetitiveness of a given
	  sequence.
	</td></tr></table></div></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_seqadv_technologies_iontorrent"></a>13.3.1.4. 
	  Ion Torrent
	</h4></div></div></div><p>
	  Use for: like Illumina. With three notable exceptions: 1) SNP
	  detection is not as good as with Illumina (more false positives and
	  false negatives) 2) de-novo assemblies will contain more single-base
	  indels and 3) Ion having problems with homopolymers, that technology
	  is not as well suited as a complementary hybrid technology for PacBio
	  as is Illumina (except for high-GC perhaps).
	</p><p>
	  Ion has a speed advantage on Illumina: if you have your own machine,
	  getting from your sample to data takes less time than with Illumina.
	</p><p>
	  Also, it looks as if Ion has fewer problems with GC content or
	  sequence motifs than Illumina.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_seqadv_technologies_454"></a>13.3.1.5. 
	  Roche 454
	</h4></div></div></div><p>
	  That technology is on the way out, but there may be two reasons to
	  not completely dismiss 454: 1) the average read length of 700 bp can
	  be seen as a plus when compared to Illumina or Ion ... but then
	  there's PacBio to take care of read length. 2) the large read-pair
	  libraries work better with 454 than Illumina mate-pair libraries,
	  something which might be important for scaffolding data where even
	  PacBio could not completely resolve long repeats.
	</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_denovo"></a>13.3.2. 
	Sequencing de-novo
      </h3></div></div></div><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem">
	  On a cheap gene fishing expedition? Probably Illumina HiSeq, at
	  least 100bp, 150 to 250bp or 300bp if your provider supports it
	  well. Paired-end definitely a plus. As alternative: Ion Torrent for
	  small organisms (maybe up to 100Mb) and when you need results quickly
	  without caring for possible frameshifts.
	</li><li class="listitem">
	  Want some larger contigs? PacBio. Add in cheap Illumina 100bp
	  paired-end (150 to 300bp if provider supports it) to get rid of
	  those last frameshifts which may remain.
	</li><li class="listitem">
	  Maybe scaffolding of contigs above? PacBio + Illumina 100bp + a
	  large paired-end library (e.g. 454 20kb)
	</li><li class="listitem">
	  Have some good friends at Oxford Nanopore who can give you some
	  MinIon engineering samples? Man, I'd kill for some bacterial test
	  sets with those (especially Bacillus subtilis 168)
	</li></ul></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_mapping"></a>13.3.3. 
	Re-sequencing / mapping
      </h3></div></div></div><p>
	There is a reason why Illumina currently dominates the market as it
	does: a cheap Illumina run (preferably paired-end) will answer most of
	your questions in 99% of the cases. Things will get difficult for
	organisms with high numbers of repeats and/or frequent genome
	re-arrangements. Then using longer read technologies and/or Illumina
	mate-pair may be required.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_seqadv_a_word_or_two_on_coverage"></a>13.4. 
      A word or two on coverage ...
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_lowcov"></a>13.4.1. 
	Low coverage isn't worth it
      </h3></div></div></div><p>
	There's one thing to be said about coverage and de-novo assembly:
	especially for bacteria, getting more than 'decent' coverage is
	<span class="emphasis"><em>cheap</em></span> with any current day technology. Every
	assembler I know will be happy to assemble de-novo genomes with
	coverages of 25x, 30x, 40x ... and the number of contigs will still
	drop dramatically between a 15x Ion Torrent and a 30x Ion Torrent
	project.
      </p><p>
	In any case, do some calculations: if the coverage you expect to get
	reaches 50x (e.g. 200MB raw sequence for a 4MB genome), then you
	(respectively the assembler) can still throw away the worst 20% of the
	sequence (with lots of sequencing errors) and concentrate on the
	really, really good parts of the sequences to get you nice contigs.
      </p><p>
	Other example: the price for 1 gigabase Illumina paired-end of a
	single DNA prep is way, way below USD 1000, even with commercial
	providers. Then you just need to do the math: is it worth investing
	10, 20, 30 or more days of wet lab work, designing primers, doing PCR
	sequencing etc. and trying to close remaining gaps or hunt down
	sequencing errors when you went for a 'low' coverage or a non-hybrid
	sequencing strategy? Or do you invest a few bucks more to get some
	additional coverage and considerably reduce the uncertainties and gaps
	which remain?
      </p><p>
	Remember, you probably want to do research on your bug and not
	research on how to best assemble and close genomes. So even if you put
	(PhD) students on the job, it's costing you time and money if you
	wanted to save money earlier in the sequencing. Penny-wise and
	pound-foolish is almost never a good strategy :-)
      </p><p>
	I do agree that with eukaryotes, things start to get a bit more
	interesting from the financial point of view.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_highcov"></a>13.4.2. 
	Catch-22: too high coverage
      </h3></div></div></div><p>
	There is, however, a catch-22 situation with coverage: too much
	coverage isn't good either. Without going into details: sequencing
	errors sometimes interfere heavily when coverage exceeds ~60x to 80x
	for 454 &amp; IonTorrent and approximately 150x to 200x for
	Solexa/Illumina.
      </p><p>
	In those cases, do yourself a favour: there's more than enough data
	for your project ... just cut it down to some reasonable amount: 40x
	to 50x for 454 &amp; IonTorrent, 100x for Solexa/Illumina.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_seqadv_when_sequencing_a_word_of_caution_regarding_your_dna"></a>13.5. 
      A word of caution regarding your DNA in hybrid sequencing projects
    </h2></div></div></div><p>
      So, you have decided that sequencing your bug with PacBio and Illumina
      (or PacBio and Ion Torrent or whatever) may be a viable way to get the
      best bang for your buck. Then please follow this advice: prepare enough
      DNA <span class="emphasis"><em>in</em></span> <span class="emphasis"><em>one</em></span>
      <span class="emphasis"><em>go</em></span> for the sequencing provider so that they can
      sequence it with all the technologies you chose without you having to
      prepare another batch ... or even grow another culture!
    </p><p>
      The reason for that is that as soon as you do that, the probability that
      there is a mutation somewhere that your first batch did not have is not
      negligible. And if there is a mutation, even if it is only one base,
      there is a &gt;95% chance that MIRA will find it, think it is some
      repetitive sequence (like a duplicated gene with a mutation in it) and
      split contigs at those places.
    </p><p>
      Now, there are times when you cannot completely be sure that different
      sequencing runs did not use slightly different batches (or even strains).
    </p><p>
      One example: the SFF files for SRA000156 and SRA001028 from the NCBI
      short read archive should both contain E. coli K12 MG1655 (two
      unpaired half plates and a paired-end plate). However, they contain
      DNA from different cultures. Furthermore, the DNA was prepared by
      different labs. The net effect is that the sequences in the paired-end
      library contain a few distinct mutations from the sequences in the two
      unpaired half-plates. Furthermore, the paired-end sequences contain
      sequences from phages that are not present in the unpaired sequences.
    </p><p>
      In those cases, provide strain information to the reads so that MIRA can
      discern possible repeats from possible SNPs.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_seqadv_for_bacteria"></a>13.6. 
      Advice for bacteria
    </h2></div></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_for_bacteria_no_not_sample_in_exponential_phase"></a>13.6.1. 
	Do not sample DNA from bacteria in exponential growth phase!
      </h3></div></div></div><p>
	The reason is simple: some bacteria grow so fast that they start
	replicating themselves even before having finished the first
	replication cycle. This leads to more DNA around the origin of
	replication being present in cells, which in turn fools assemblers and
	mappers into believing that those areas are either repeats or that
	there are copy number changes.
      </p><p>
	Sample. In. Stationary. Phase!
      </p><p>
	For de-novo assemblies, MIRA will warn you if it detects data which
	points at exponential phase. In mapping assemblies, look at the
	coverage profile of your genome: if you see a smile shape (or V
	shape), you have a problem.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_seqadv_for_bacteria:_beware_of_high_copy_number_plasmids"></a>13.6.2. 
	Beware of (high copy number) plasmids!
      </h3></div></div></div><p>
	This is a source of interesting problems and furthermore gets people
	wondering why MIRA sometimes creates more contigs than other
	assemblers when it usually creates fewer.
      </p><p>
	Here's the short story: there are data sets which include one or
	several high-copy plasmid(s). Here's a particularly ugly example:
	SRA001028 from the NCBI short read archive which contains a plate of
	paired-end reads for E. coli K12 MG1655-G
	(<a class="ulink" href="ftp://ftp.ncbi.nih.gov/pub/TraceDB/ShortRead/SRA001028/" target="_top">ftp://ftp.ncbi.nih.gov/pub/TraceDB/ShortRead/SRA001028/</a>).
      </p><p>
	The genome is sequenced at ~10x coverage, but during the assembly,
	three intermediate contigs with ~2kb attain a silly maximum coverage
	of ~1800x each.  This means that there were ~540 copies of this
	plasmid (or these plasmids) in the sequencing.
      </p><p>
	When using the uniform read distribution algorithm - which is switched
	on by default when using "--job=" and the quality level of 'accurate' -
	MIRA will determine the average coverage of the genome to be
	~10x.  Subsequently this leads MIRA to dutifully create ~500 additional
	contigs (plus a number of contig debris) with various incarnations of
	that plasmid at an average of ~10x, because it thought that these were
	repetitive sites within the genome that needed to be disentangled.
      </p><p>
	Things get even more interesting when some of the plasmid / phage
	copies are slightly different from each other. These too will be split
	apart and when looking through the results later on and trying to join
	the copies back into one contig, one will see that this should not be
	done because there are real differences.
      </p><p>
	DON'T PANIC!
      </p><p>
	The only effect this has on your assembly is that the number of
	contigs goes up. This in turn leads to a number of questions in my
	mailbox why MIRA is sometimes producing more contigs than Newbler (or
	other assemblers), but that is another story (hint: Newbler either
	collapses repeats or leaves them completely out of the picture by not
	assembling repetitive reads).
      </p><p>
	What you can do is the following:
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    either you assemble everything together and then join the plasmid
	    contigs manually after assembly, e.g. in gap4 (drawback: on really
	    high copy numbers, MIRA will work quite a bit longer ... and you
	    will have a lot of fun joining the contigs afterwards)
	  </p></li><li class="listitem"><p>
	    or, after you found out about the plasmid(s) and know the sequence,
	    you filter out reads in the input data which contain this sequence
	    (you can use <span class="command"><strong>mirabait</strong></span> for this) and assemble the
	    remaining reads.
	  </p></li></ol></div></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_bitsandpieces"></a>Chapter 14. Bits and pieces</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_bap_using_ssaha2_smalt_to_screen_for_vector_sequence">14.1. 
      Using SSAHA2 / SMALT to screen for vector sequence
    </a></span></dt></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">Just when you think it's finally settled, it isn't.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
  The documentation of MIRA 3.9.x has not completely caught up yet with the changes introduced by MIRA now using manifest files. Quite a number of recipes still show the old command-line style, e.g.:
  </p><pre class="screen">
mira --project=... --job=... ...</pre><p>
    For those cases, please refer to chapter 3 (the reference) for how to write manifest files.
  </p></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_bap_using_ssaha2_smalt_to_screen_for_vector_sequence"></a>14.1. 
      Using SSAHA2 / SMALT to screen for vector sequence
    </h2></div></div></div><p>
      If your sequencing provider gave you data which was NOT pre-clipped for
      vector sequence, you can do this yourself in a pretty robust manner
      using SSAHA2 -- or the successor, SMALT -- from the Sanger Centre. You
      just need to know which sequencing vector the provider used and have its
      sequence in FASTA format (ask your provider).
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      This screening is a valid method for any type of Sanger sequencing
      vectors, 454 adaptors, Illumina adaptors and paired-end adaptors
      etc. However, you probably want to use it only for Sanger type data as
      MIRA already knows all standard 454, Ion Torrent and Illumina adaptors.
    </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      SSAHA2 and SMALT need their input data to be in FASTA format, so to
      run them you will also need your sequences in FASTA format. For MIRA,
      however, you can load your original data in whatever format it was present.
    </td></tr></table></div><p>
      For SSAHA2 follow these steps (most are the same as in the example
      above):
    </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>ssaha2 -output ssaha2
  -kmer 8 -skip 1 -seeds 1 -score 12 -cmatch 9 -ckmer 6
  /path/where/the/vector/data/resides/vector.fasta
  <em class="replaceable"><code>yourinputsequences.fasta</code></em> &gt; <em class="replaceable"><code>screendataforyoursequences.ssaha2</code></em></code></strong></pre><p>
      Then, in your manifest file, add the following line in the readgroup
      which contains the sequences you screened:
    </p><pre class="screen">
<strong class="userinput"><code>readgroup
...
data = <em class="replaceable"><code>yourinputsequences_inwhateverformat_thisexamplehasfastq.fastq</code></em>
data = <em class="replaceable"><code>screendataforyoursequences.ssaha2</code></em>
...</code></strong></pre><p>
      For SMALT, the only difference is that you use SMALT for generating the
      vector-screen file and ask SMALT to generate it in SSAHA2 format. As
      SMALT works in two steps (indexing and then mapping), you also need to
      perform it in two steps and then call MIRA. E.g.:
    </p><pre class="screen">
<code class="prompt">$</code> <strong class="userinput"><code>smalt index -k 7 -s 1 smaltidxdb /path/where/the/vector/data/resides/vector.fasta</code></strong>
<code class="prompt">$</code> <strong class="userinput"><code>smalt map -f ssaha -d -1 -m 7 smaltidxdb <em class="replaceable"><code>yourinputsequences.fasta</code></em> &gt; <em class="replaceable"><code>screendataforyoursequences.smalt</code></em></code></strong></pre><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      Please note that, due to subtle differences between output of SSAHA2 (in
      ssaha2 format) and SMALT (in ssaha2 format), MIRA identifies the source
      of the screening (and the parsing method it needs) by the name of the
      screen file. Therefore, screens done with SSAHA2 need to have the
      postfix <code class="filename">.ssaha2</code> in the file name and screens done
      with SMALT need
      <code class="filename">*.smalt</code>.
    </td></tr></table></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_faq"></a>Chapter 15. Frequently asked questions</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect_faq_assembly_quality">15.1. 
      Assembly quality
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_faq_what_is_the_effect_of_uniform_read_distribution_as:urd?">15.1.1. 
	What is the effect of uniform read distribution (-AS:urd)?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_there_are_too_many_contig_debris_when_using_uniform_read_distribution_how_do_i_filter_for_good_contigs?">15.1.2. 
	There are too many contig debris when using uniform read distribution, how do I filter for "good" contigs?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_when_finishing_which_places_should_i_have_a_look_at?">15.1.3. 
	When finishing, which places should I have a look at?
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_faq_454_data">15.2. 
      454 data
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_faq_what_do_i_need_sffs_for?">15.2.1. 
	What do I need SFFs for?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_what's_sff_extract_and_where_do_i_get_it?">15.2.2. 
	What's sff_extract and where do I get it?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_do_i_need_the_sfftools_from_the_roche_software_package?">15.2.3. 
	Do I need the sfftools from the Roche software package?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_combining_sffs">15.2.4. 
	Combining SFFs
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_adaptors_and_pairedend_linker_sequences">15.2.5. 
	Adaptors and paired-end linker sequences
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_what_do_i_get_in_pairedend_sequencing?">15.2.6. 
	What do I get in paired-end sequencing?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_sequencing_protocol">15.2.7. 
	Sequencing protocol
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_filtering_by_seqlen">15.2.8. 
	Filtering sequences by length and re-assembly
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_faq_solexa___illumina_data">15.3. 
      Solexa / Illumina data
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_faq_can_i_see_deletions?">15.3.1. 
	Can I see deletions?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_can_i_see_insertions?">15.3.2. 
	Can I see insertions?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_denovo_assembly_with_solexa_data">15.3.3. 
	De-novo assembly with Solexa data
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_faq_hybrid_assemblies">15.4. 
      Hybrid assemblies
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_faq_what_are_hybrid_assemblies?">15.4.1. 
	What are hybrid assemblies?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_what_differences_are_there_in_hybrid_assembly_strategies?">15.4.2. 
	What differences are there in hybrid assembly strategies?
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_faq_masking">15.5. 
      Masking
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_faq_should_i_mask?">15.5.1. 
	Should I mask?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_how_can_i_apply_custom_masking?">15.5.2. 
	How can I apply custom masking?
      </a></span></dt></dl></dd><dt><span class="sect1"><a href="#sect_faq_miscellaneous">15.6. 
      Miscellaneous
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_faq_what_are_megahubs?">15.6.1. 
	What are megahubs?
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_passes_and_loops">15.6.2. 
	Passes and loops
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_debris">15.6.3. 
	Debris
      </a></span></dt><dt><span class="sect2"><a href="#sect_faq_tmpf_files:_more_info_on_what_happened_during_the_assembly">15.6.4. 
	Log and temporary files: more info on what happened during the assembly
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect_faq_sequence_clipping_after_load">15.6.4.1. 
	  Sequence clipping after load
	</a></span></dt></dl></dd></dl></dd><dt><span class="sect1"><a href="#sect_faq_platforms_and_compiling">15.7. 
      Platforms and Compiling
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect_faq_windows">15.7.1. 
	Windows
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">Every question defines its own answer. Except perhaps 'Why a duck?'
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
  The documentation of MIRA 3.9.x has not completely caught up yet with the changes introduced by MIRA now using manifest files. Quite a number of recipes still show the old command-line style, e.g.:
  </p><pre class="screen">
mira --project=... --job=... ...</pre><p>
    For those cases, please refer to chapter 3 (the reference) for how to write manifest files.
  </p></td></tr></table></div><p>
    This list is a collection of frequently asked questions and answers
    regarding different aspects of the MIRA assembler.
  </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
    This document needs to be overhauled.
  </td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_faq_assembly_quality"></a>15.1. 
      Assembly quality
    </h2></div></div></div><div class="qandaset"><a name="idp42837568"></a><dl><dt>15.1.1. <a href="#idp40031104">Test question 1</a></dt><dt>15.1.2. <a href="#idp40207008">Test question 2</a></dt></dl><table border="0" style="width: 100%;"><colgroup><col align="left" width="1%"><col></colgroup><tbody><tr class="question"><td align="left" valign="top"><a name="idp40031104"></a><a name="idp40127568"></a><p><b>15.1.1.</b></p></td><td align="left" valign="top"><p>Test question 1</p></td></tr><tr class="answer"><td align="left" valign="top"></td><td align="left" valign="top"><p>Test answer 1</p></td></tr><tr class="question"><td align="left" valign="top"><a name="idp40207008"></a><a name="idp40826864"></a><p><b>15.1.2.</b></p></td><td align="left" valign="top"><p>Test question 2</p></td></tr><tr class="answer"><td align="left" valign="top"></td><td align="left" valign="top"><p>Test answer 2</p></td></tr></tbody></table></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_what_is_the_effect_of_uniform_read_distribution_as:urd?"></a>15.1.1. 
	What is the effect of uniform read distribution (-AS:urd)?
      </h3></div></div></div><p>
      </p><pre class="screen">
	I have a project which I once started quite normally via
	"--job=denovo,genome,accurate,454"
	and once with explicitly switching off the uniform read distribution
	"--job=denovo,genome,accurate,454 -AS:urd=no"
	I get less contigs in the second case and I wonder if that is not better.
	Can you please explain?
      </pre><p>
      </p><p>
	Since 2.9.24x1, MIRA has a feature called "uniform read distribution" which is
	normally switched on. This feature reduces over-compression of repeats during
	the contig building phase and makes sure that, e.g., a rRNA stretch which is
	present 10 times in a bacterium will also be present approximately 10 times in
	your result files.
      </p><p>
	It works a bit like this: under the assumption that reads in a project are
	uniformly distributed across the genome, MIRA will enforce an average coverage
	and temporarily reject reads from a contig when this average coverage
	multiplied by a safety factor is reached at a given site.
      </p><p>
	It's generally a very useful tool to disentangle repeats, but has some slight
	secondary effects: rejection of otherwise perfectly good reads. The
	assumption of read distribution uniformity is the big problem we have here:
	of course it's not really valid. You sometimes have less, and sometimes more
	than "the average" coverage. Furthermore, the new sequencing technologies -
	454 perhaps but especially the microreads from Solexa &amp; probably also SOLiD -
	show that you also have a skew towards the site of replication origin.
      </p><p>
	One example: let's assume the average coverage of your project is 8 and by
	chance at one place you have 17 (non-repetitive) reads, then the following
	happens:
      </p><p>
	$p$= parameter of -AS:urdsip
      </p><p>
	Pass 1 to $p-1$: MIRA happily assembles everything together and calculates a
	number of different things, amongst them an average coverage of ~8. At the
	end of pass '$p-1$', it will announce this average coverage as first estimate
	to the assembly process.
      </p><p>
	Pass $p$: MIRA has still assembled everything together, but at the end of each
	pass the contig self-checking algorithms now include an "average coverage
	check". They'll invariably find the 17 reads stacked and decide (looking at
	the -AS:urdct parameter which I now assume to be 2) that 17 is larger than
	2*8 and that this very well may be a repeat. The reads get flagged as
	possible repeats.
      </p><p>
	Pass $p+1$ to end: the "possibly repetitive" reads get a much tougher
	treatment in MIRA. Amongst other things, when building the contig, the contig
	now makes sure that "possibly repetitive" reads do not over-stack by an average
	coverage multiplied by a safety value (-AS:urdcm) which I'll assume in this
	example to be 1.5. So, at a certain point, say when read 14 or 15 of
	that possible repeat want to be aligned to the contig at this given place, the
	contig will just flatly refuse and tell the assembler to please find another
	place for them, be it in this contig that is built or any other that will
	follow. Of course, if the assembler cannot comply, the reads 14 to 17 will end
	up as contiglet (contig debris, if you want) or if it was only one read that
	got rejected like this, it will end up as singlet or in the debris file.
      </p><p>
	Tough luck. I do have ideas on how to re-integrate those reads at the end of an
	assembly, but I had deferred doing this as in every case I had looked up,
	adding those reads to the contigs wouldn't have changed anything ... there's
	already enough coverage. What I do in those cases is simply filter away the
	contiglets (defined as being of small size and having an average coverage
	below the average coverage of the project / 3 (or 2.5)) from a project.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_there_are_too_many_contig_debris_when_using_uniform_read_distribution_how_do_i_filter_for_good_contigs?"></a>15.1.2. 
	There are too many contig debris when using uniform read distribution, how do I filter for "good" contigs?
      </h3></div></div></div><p>
      </p><pre class="screen">
	When using uniform read distribution there are too many contig with low
	coverage which I don't want to integrate by hand in the finishing process. How
	do I filter for "good" contigs?
      </pre><p>
      </p><p>
	OK, let's get rid of the cruft. It's easy, really: you just need to look up
	one number, take two decisions and then launch a command.
      </p><p>
	The first decision you need to take is on the minimum average coverage the
	contigs you want to keep should have. Have a look at the file
	<code class="filename">*_info_assembly.txt</code> which is in the info directory after
	assembly. In the "Large contigs" section, there's a "Coverage assessment"
	subsection. It looks a bit like this:
      </p><pre class="screen">
	...
	Coverage assessment:
	--------------------
	Max coverage (total): 43
	Max coverage
	Sanger: 0
	454:    43
	Solexa: 0
	Solid:  0
	Avg. total coverage (size &#8805; 5000): 22.30
	Avg. coverage (contig size &#8805; 5000)
	Sanger: 0.00
	454:    22.05
	Solexa: 0.00
	Solid:  0.00
	...
      </pre><p>
      </p><p>
	This project was obviously a 454 only project, and the average coverage for it
	is ~22. This number was estimated by MIRA by taking only contigs of at least
	5Kb into account, which for sure left out everything which could be
	categorised as debris. It's a pretty solid number.
      </p><p>
	Now, depending on how much time you want to invest performing some manual
	polishing, you should extract contigs which have at least the following
	fraction of the average coverage:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    2/3 if a quick and "good enough" result is what you want and you don't want to
	    do any manual polishing. In this example, that would be around 14 or 15.
	  </p></li><li class="listitem"><p>
	    1/2 if you want to have a "quick look" and eventually perform some
	    contig joins. In this example the number would be 11.
	  </p></li><li class="listitem"><p>
	    1/3 if you want to be quite accurate and for sure not lose any possible
	    repeat. That would be 7 or 8 in this example.
	  </p></li></ul></div><p>
      </p><p>
	The second decision you need to take is on the minimum length your contigs
	should have. This decision is a bit dependent on the sequencing technology you
	used (the read length). The following are some rules of thumb:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    Sanger: 1000 to 2000
	  </p></li><li class="listitem"><p>
	    454 GS20: 500
	  </p></li><li class="listitem"><p>
	    454 FLX: 1000
	  </p></li><li class="listitem"><p>
	    454 Titanium: 1500
	  </p></li></ul></div><p>
      </p><p>
	Let's assume we decide for an average coverage of 11 and a minimum length of
	1000 bases. Now you can filter your project with miraconvert
      </p><pre class="screen">
	miraconvert -x 1000 -y 11 sourcefile.caf filtered.caf
      </pre><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_when_finishing_which_places_should_i_have_a_look_at?"></a>15.1.3. 
	When finishing, which places should I have a look at?
      </h3></div></div></div><p>
      </p><pre class="screen">
	I would like to find those places where MIRA wasn't sure and give it a quick
	shot. Where do I need to search?
      </pre><p>
      </p><p>
	Search for the following tags in gap4 or any other finishing program
	for finding places of importance (in this order).
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    IUPc
	  </p></li><li class="listitem"><p>
	    UNSc
	  </p></li><li class="listitem"><p>
	    SRMc
	  </p></li><li class="listitem"><p>
	    WRMc
	  </p></li><li class="listitem"><p>
	    STMU (only hybrid assemblies)
	  </p></li><li class="listitem"><p>
	    STMS (only hybrid assemblies)
	  </p></li></ul></div><p>
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_faq_454_data"></a>15.2. 
      454 data
    </h2></div></div></div><div class="qandaset"><a name="idp51944320"></a><dl><dt>15.2.1. <a href="#idp51944608">What are little boys made of?</a></dt><dt>15.2.2. <a href="#idp51946880">What are little girls made of?</a></dt></dl><table border="0" style="width: 100%;"><colgroup><col align="left" width="1%"><col></colgroup><tbody><tr class="question"><td align="left" valign="top"><a name="idp51944608"></a><a name="idp51944896"></a><p><b>15.2.1.</b></p></td><td align="left" valign="top"><p>What are little boys made of?</p></td></tr><tr class="answer"><td align="left" valign="top"></td><td align="left" valign="top"><p>Snips and snails and puppy dog tails.</p></td></tr><tr class="question"><td align="left" valign="top"><a name="idp51946880"></a><a name="idp51947168"></a><p><b>15.2.2.</b></p></td><td align="left" valign="top"><p>What are little girls made of?</p></td></tr><tr class="answer"><td align="left" valign="top"></td><td align="left" valign="top"><p>Sugar and spice and everything nice.</p></td></tr></tbody></table></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_what_do_i_need_sffs_for?"></a>15.2.1. 
	What do I need SFFs for?
      </h3></div></div></div><p>
      </p><pre class="screen">
	I need the .sff files for MIRA to load ...
      </pre><p>
      </p><p>
	Nope, you don't, but it's a common misconception. MIRA does not load SFF
	files, it loads FASTA, FASTA qualities, FASTQ, XML, CAF, EXP and PHD. The
	reason why one should start from the SFF is: those files can be used to create
	an XML file in TRACEINFO format. This XML contains the absolutely vital
	information regarding clipping information of the 454 adaptors (the sequencing
	vector of 454, if you want).
      </p><p>
	For 454 projects, MIRA will then load the FASTA, FASTA quality and the
	corresponding XML. Or from CAF, if you have your data in CAF format.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_what's_sff_extract_and_where_do_i_get_it?"></a>15.2.2. 
	What's sff_extract and where do I get it?
      </h3></div></div></div><p>
      </p><pre class="screen">
	How do I extract the sequence, quality and other values from SFFs?
      </pre><p>
      </p><p>
	Use the <span class="command"><strong>sff_extract</strong></span> script from Jose Blanca at the
	University of Valencia to extract everything you need from the SFF
	files (sequence, qualities and ancillary information). The home of
	sff_extract is: <a class="ulink" href="http://bioinf.comav.upv.es/sff_extract/index.html" target="_top">http://bioinf.comav.upv.es/sff_extract/index.html</a> but I am
	thankful to Jose for giving permission to distribute the script in the
	MIRA 3rd party package (separate download).
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_do_i_need_the_sfftools_from_the_roche_software_package?"></a>15.2.3. 
	Do I need the sfftools from the Roche software package?
      </h3></div></div></div><p>
	No, not anymore. Use the <span class="command"><strong>sff_extract</strong></span> script to
	extract your reads. Though the Roche sfftools package contains a few
	additional utilities which could be useful.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_combining_sffs"></a>15.2.4. 
	Combining SFFs
      </h3></div></div></div><p>
      </p><pre class="screen">
	I am trying to use MIRA to assemble reads obtained with the 454 technology
	but I can't combine my sff files since I have two files obtained with GS20
	system and 2 others obtained with the GS-FLX system. Since they use
	different cycles (42 and 100) I can't use the sfffile to combine both.
      </pre><p>
      </p><p>
	You do not need to combine SFFs before translating them into something
	MIRA (or other software tools) understands. Use
	<span class="command"><strong>sff_extract</strong></span> which extracts data from the SFF files
	and combines this into input files.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_adaptors_and_pairedend_linker_sequences"></a>15.2.5. 
	Adaptors and paired-end linker sequences
      </h3></div></div></div><p>
      </p><pre class="screen">
	I have no idea about the adaptor and the linker sequences, could you send me
	the sequences please?
      </pre><p>
      </p><p>
	Here are the sequences as filed by 454 in their patent application:
      </p><pre class="screen">
	&gt;AdaptorA
	CTGAGACAGGGAGGGAACAGATGGGACACGCAGGGATGAGATGG
	&gt;AdaptorB
	CTGAGACACGCAACAGGGGATAGGCAAGGCACACAGGGGATAGG
      </pre><p>
      </p><p>
	However, looking through some earlier project data I had, I also retrieved the
	following (by simply making a consensus of sequences that did not match the
	target genome anymore):
      </p><pre class="screen">
	&gt;5prime454adaptor???
	GCCTCCCTCGCGCCATCAGATCGTAGGCACCTGAAA
	&gt;3prime454adaptor???
	GCCTTGCCAGCCCGCTCAGATTGATGGTGCCTACAG
      </pre><p>
      </p><p>
	Go figure, I have absolutely no idea where these come from as they also do not
	comply with the "tcag" ending the adaptors should have.
      </p><p>
	I currently know one linker sequence (454/Roche also calls it <span class="emphasis"><em>spacer</em></span>)
	for GS20 and FLX paired-end sequencing:
      </p><pre class="screen">
	&gt;flxlinker
	GTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC
      </pre><p>
      </p><p>
	For Titanium data using standard Roche protocol, you need to screen for two
	linker sequences:
      </p><pre class="screen">
	&gt;titlinker1
	TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG
	&gt;titlinker2
	CGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA
      </pre><p>
      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	Some sequencing labs modify the adaptor sequences for tagging and
	similar things. Ask your sequencing provider for the exact adaptor
	and/or linker sequences.
      </td></tr></table></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_what_do_i_get_in_pairedend_sequencing?"></a>15.2.6. 
	What do I get in paired-end sequencing?
      </h3></div></div></div><p>
      </p><pre class="screen">
	Another question I have is does the read pair sequences have further
	adaptors/vectors in the forward and reverse strands?
      </pre><p>
      </p><p>
	Like for normal 454 reads - the normal A and B adaptors can be present
	in paired-end reads. In theory this could look like this:
      </p><p>
	A-Adaptor - DNA1 - Linker - DNA2 - B-Adaptor.
      </p><p>
	It's possible that one of the two DNA fragments is *very* short or is missing
	completely, then one has something like this:
      </p><p>
	A-Adaptor - DNA1 - Linker - B-Adaptor
      </p><p>
	or
      </p><p>
	A-Adaptor - Linker - DNA2 - B-Adaptor
      </p><p>
	And then there are all intermediate possibilities with the read not having one
	of the two adaptors (or both). Though it appears that the majority of reads
	will contain the following:
      </p><p>
	DNA1 - Linker - DNA2
      </p><p>
	There is one caveat: according to current paired-end protocols, the sequences
	will <span class="bold"><strong>NOT</strong></span> have the direction
      </p><pre class="screen">
	---&gt; Linker &lt;---
      </pre><p>
	as one might expect when being used to Sanger Sequencing, but rather in this
	direction
      </p><pre class="screen">
	&lt;--- Linker ---&gt;
      </pre><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_sequencing_protocol"></a>15.2.7. 
	Sequencing protocol
      </h3></div></div></div><p>
      </p><pre class="screen">
	Is there a way I can find out which protocol was used?
      </pre><p>
      </p><p>
	Yes. The best thing to do is obviously to ask your sequencing provider.
      </p><p>
	If this is - for whatever reason - not possible, this list might help.
      </p><p>
	Are the sequences ~100-110 bases long? It's GS20.
      </p><p>
	Are the sequences ~220-250 bases long? It's FLX.
      </p><p>
	Are the sequences ~350-450 bases long? It's Titanium.
      </p><p>
	Do the sequences contain a linker
	(GTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC)? It's a paired end protocol.
      </p><p>
	If the sequences left and right of the linker are ~29bp, it's the old short
	paired end (SPET, also it's most probably from a GS20). If longer, it's long
	paired-end (LPET, from a FLX).
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_filtering_by_seqlen"></a>15.2.8. 
	Filtering sequences by length and re-assembly
      </h3></div></div></div><pre class="screen">
I have two datasets of ~500K sequences each and the sequencing company
already did an assembly (using MIRA) on the basecalled and fully processed
reads (using of course the accompanying *qual file). Do you suggest that I
should redo the assembly after filtering out sequences being shorter than a
certain length (e.g. those that are &lt;200bp)? In other words, am I taking into
account low quality sequences if I do the assembly the way the sequencing
company did it (fully processed reads + quality files)?
      </pre><p>
	I don't think that filtering out "shorter" reads will bring much
	positive improvement. If the sequencing company used the standard
	Roche/454 pipeline, the cut-offs for quality are already quite good,
	remaining sequences should be, even when being &lt; 200bp, not of bad
	quality, simply a bit shorter.
      </p><p>
	Worse, you might even introduce a bias when filtering out short
	sequences: chemistry and library construction being what they are
	(rather imprecise and sometimes problematic), some parts of DNA/RNA
	yield smaller sequences per se ... and filtering those out might not
	be the best move.
      </p><p>
	You might consider redoing the assembly if the company used a rather old
	version of MIRA (&lt;3.0.0 for sure, perhaps also &lt;3.0.5).
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_faq_solexa___illumina_data"></a>15.3. 
      Solexa / Illumina data
    </h2></div></div></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_can_i_see_deletions?"></a>15.3.1. 
	Can I see deletions?
      </h3></div></div></div><p>
      </p><pre class="screen">
	Suppose you ran the genome of a strain that had one or more large
	deletions. Would it be clear from the data that a deletion had occurred?
      </pre><p>
      </p><p>
	In the question above, I assume you'd compare your strain <span class="emphasis"><em>X</em></span> to a strain
	<span class="emphasis"><em>Ref</em></span> and that <span class="emphasis"><em>X</em></span> had deletions compared to
	<span class="emphasis"><em>Ref</em></span>. Furthermore, I base my answer on data sets I have seen, which
	presently were 36 and 76 mers, paired and unpaired.
      </p><p>
	Yes, this would be clear. And it's a piece of cake with MIRA.
      </p><p>
	Short deletions (1 to 10 bases): they'll be tagged SROc or WRMc.
	General rule: deletions of up to 10 to 12% of the length of your read should
	be found and tagged without problem by MIRA, above that it may or may not,
	depending a bit on coverage, indel distribution and luck.
      </p><p>
	Long deletions (longer than read length): they'll be tagged with MCVc tag by
	MIRA in the consensus. Additionally, when looking at the FASTA files when
	running the CAF result through miraconvert: long stretches of
	sequences without coverage (the @ sign in the FASTAs) of <span class="emphasis"><em>X</em></span> show missing
	genomic DNA.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_can_i_see_insertions?"></a>15.3.2. 
	Can I see insertions?
      </h3></div></div></div><p>
      </p><pre class="screen">
	Suppose you ran the genome of a strain X that had a plasmid missing from the
	reference sequence. Alternatively, suppose you ran a strain that had picked
	up a prophage or mobile element lacking in the reference. Would that
	situation be clear from the data?
      </pre><p>
      </p><p>
	Short insertions (1 to 10 bases): they'll be tagged SROc or WRMc.
	General rule: insertions of up to 10 to 12% of the length of your read should
	be found and tagged without problem by MIRA, above that it may or may not,
	depending a bit on coverage, indel distribution and luck.
      </p><p>
	Long insertions: it's a bit more work than for deletions. But if you ran a
	de-novo assembly on all reads not mapped against your reference sequence,
	chances are good you'd get good chunks of the additional DNA put together.
      </p><p>
	Once the Solexa paired-end protocol is completely rolled out and used on a
	regular basis, you would even be able to place the additional element into the
	genome (approximately).
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_denovo_assembly_with_solexa_data"></a>15.3.3. 
	De-novo assembly with Solexa data
      </h3></div></div></div><p>
      </p><pre class="screen">
	Any chance you could assemble de-novo the sequence of a genome from just the Solexa
	data?
      </pre><p>
      </p><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
	Highly opinionated answer ahead, your mileage may vary.
      </td></tr></table></div><p>
	Allow me to make a clear statement on this: maybe.
      </p><p>
	But the result would probably be nothing I would call a good
	assembly. If you used anything below 76mers, I'm highly sceptical
	towards the idea of de-novo assembly with Solexa (or ABI SOLiD) reads
	that are in the 30 to 50bp range. They're really too short for that,
	even paired end won't help you much (especially if you have library
	sizes of just 200 or 500bp). Yes, there are papers describing
	different draft assemblers (SHARCGS, EDENA, Velvet, Euler and others),
	but at the moment the results are less than thrilling to me.
      </p><p>
	If a sequencing provider came to me with N50 numbers for an
	<span class="emphasis"><em>assembled genome</em></span> in the 5-8 Kb range, I'd laugh
	in his face. Or weep. I wouldn't dare to call this even
	'draft'. I'd just call it junk.
      </p><p>
	On the other hand, this could be enough for some purposes like, e.g.,
	getting a quick overview on the genetic baggage of a bug. Just don't
	expect a finished genome.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_faq_hybrid_assemblies"></a>15.4. 
      Hybrid assemblies
    </h2></div></div></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_what_are_hybrid_assemblies?"></a>15.4.1. 
	What are hybrid assemblies?
      </h3></div></div></div><p>
	Hybrid assemblies are assemblies where one used more than one sequencing
	technology. E.g.: Sanger and 454, or 454 and Solexa, or Sanger and Solexa
	etc.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_what_differences_are_there_in_hybrid_assembly_strategies?"></a>15.4.2. 
	What differences are there in hybrid assembly strategies?
      </h3></div></div></div><p>
	Basically, one can choose two routes: multi-step or all-in-one-go.
      </p><p>
	Multi-steps means: to assemble reads from one sequencing technology (ideally
	the one from the shorter tech like, e.g., Solexa), fragment the resulting
	contigs into pseudo-reads of the longer tech and assemble these with the real
	reads from the longer tech (like, e.g., 454). The advantage of this approach
	is that it will probably be quite a bit faster than the all-in-one-go approach. The
	disadvantage is that you lose a lot of information when using only the consensus
	sequence of the shorter read technology for the final assembly.
      </p><p>
	All-in-one-go means: use all reads in one single assembly. The advantage of
	this is that the resulting alignment will be made of true reads with a maximum
	of information contained to allow a really good finishing. The disadvantage is
	that the assembly will take longer and will need more RAM.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_faq_masking"></a>15.5. 
      Masking
    </h2></div></div></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_should_i_mask?"></a>15.5.1. 
	Should I mask?
      </h3></div></div></div><p>
      </p><pre class="screen">
	In EST projects, do you think that the highly repetitive option will get rid
	of the repetitive sequences without going to the step of repeat masking?
      </pre><p>
      </p><p>
	For eukaryotes, yes. Please also consult the [-KS:mnr] option.
      </p><p>
	Remember: you still <span class="bold"><strong>MUST</strong></span> have sequencing vectors and adaptors
	clipped! In EST sequences the poly-A tails should also be clipped (or let
	MIRA do it).
      </p><p>
	For prokaryotes, I'm a big fan of having a first look at unmasked data.
	Just try to start MIRA without masking the data. After something like 30
	minutes, the all-vs-all comparison algorithm should be through with a first
	comparison round. grep the log for the term "megahub" ... if it doesn't
	appear, you probably don't need to mask repeats.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_how_can_i_apply_custom_masking?"></a>15.5.2. 
	How can I apply custom masking?
      </h3></div></div></div><p>
      </p><pre class="screen">
	I want to mask away some sequences in my input. How do I do that?
      </pre><p>
      </p><p>
	First, if you want to have Sanger sequencing vectors (or 454 adaptor
	sequences) "masked", please note that you should rather use ancillary data
	files (CAF, XML or EXP) and use the sequencing or quality clip options there.
      </p><p>
	Second, please make sure you have read and understood the documentation for all
	-CL parameters in the main manual, but especially -CL:mbc:mbcgs:mbcmfg:mbcmeg
	as you might want to switch it on or off or set different values depending on
	your pipeline and on your sequencing technology.
      </p><p>
	You can without problem mix your normal repeat masking pipeline with the FASTA
	or EXP input for MIRA, as long as you <span class="bold"><strong>mask</strong></span> and not <span class="bold"><strong>clip</strong></span> the
	sequence.
      </p><p>
	An example:
      </p><pre class="screen">
	&gt;E09238ARF0
	tcag GTGTCAGTGTTGACTGTAAAAAAAAAGTACGTATGGACTGCATGTGCATGTCATGGTACGTGTCA
	GTCAGTACAAAAAAAAAAAAAAAAAAAAGTACGT tgctgacgcacatgatcgtagc
      </pre><p>
      </p><p>
	(spaces inserted just as visual helper in the example sequence, they would not
	occur in the real stuff)
      </p><p>
	The XML will contain the following clippings:
	left clip = 4    (clipping away the "tcag" which are the last four bases of the
	adaptor used by Roche)
	right clip= ~90  (clipping away the "tgctgac..." lower case sequence on the
	right side of the sequence above).
      </p><p>
	Now, on the FASTA file that was generated with reads_sff.py or with the Roche
	sff* tools, you can let run, e.g., a repeat masker. The result could look like
	this:
      </p><pre class="screen">
	&gt;E09238ARF0
	tcag XXXXXXXXX TTGACTGTAAAAAAAAAGTACGTATGGACTGCATGTGCATGTCATGGTACGTGTCA
	GTCAGTACAAAAAAAAAAAAAAAAAAAAGTACGT tgctgacgcacatgatcgtagc
      </pre><p>
      </p><p>
	The part with the Xs was masked away by your repeat masker. Now, when MIRA
	loads the FASTA, it will first apply the clippings from the XML file (they're
	still the same). Then, if the option to clip away masked areas of a read
	is switched on (-CL:mbc, which is normally on for EST projects), it will search for the
	stretches of X and internally also put clips to the sequence. In the example
	above, only the following sequence would remain as "working sequence" (the
	clipped parts would still be present, but not used for any computation):
      </p><pre class="screen">
	&gt;E09238ARF0
	...............TTGACTGTAAAAAAAAAGTACGTATGGACTGCATGTGCATGTCATGGTACGTGTCA
	GTCAGTACAAAAAAAAAAAAAAAAAAAAGTACGT........................
      </pre><p>
      </p><p>
	Here you can also see the reason why your filters should <span class="bold"><strong>mask</strong></span> and not
	clip the sequence. If you change the length of the sequence, the clips in the
	XML would not be correct anymore, wrong clippings would be made, wrong
	sequence reconstructed, chaos ensues and the world would ultimately end. Or
	something.
      </p><p>
	<span class="bold"><strong>IMPORTANT!</strong></span> It might be that you do not want MIRA to merge the masked
	part of your sequence with a left or right clip, but that you want to keep it
	something like DNA - masked part - DNA. In this case, consult the manual for
	the -CL:mbc switch, either switch it off or set adequate options for the
	boundaries and gap sizes.
      </p><p>
	Now, if you look at the sequence above, you will see two possible poly-A
	tails ... at least the real poly-A tail should be masked else you will get
	megahubs with all the other reads having the poly-A tail.
      </p><p>
	You have two possibilities: you mask it yourself with your own program or you let
	MIRA do the job (-CL:cpat, which should normally be on for EST projects but I
	forgot to set the correct switch in the versions prior to 2.9.26x3, so you
	need to set it manually for 454 EST projects there).
      </p><p>
	<span class="bold"><strong>IMPORTANT!</strong></span> Never, ever use two poly-A tail maskers (your own and
	the one from MIRA): you would risk masking too much. Example: assume the above
	read you masked with a poly-A masker. The result would very probably look like
	this:
      </p><pre class="screen">
	&gt;E09238ARF0
	tcag XXXXXXXXX TTGACTGTAAAAAAAAAGTACGTATGGACTGCATGTGCATGTCATGGTACGTGTCA
	GTCAGTAC XXXXXXXXXXXXXXXXXXXX GTACGT tgctgacgcacatgatcgtagc
      </pre><p>
      </p><p>
	And MIRA would internally make the following out of it after loading:
      </p><pre class="screen">
	&gt;E09238ARF0
	...............TTGACTGTAAAAAAAAAGTACGTATGGACTGCATGTGCATGTCATGGTACGTGTCA
	GTCAGTAC..................................................
      </pre><p>
      </p><p>
	and then apply the internal poly-A tail masker:
      </p><pre class="screen">
	&gt;E09238ARF0
	...............TTGACTGT................................................
	..........................................................
      </pre><p>
      </p><p>
	You'd be left with ... well, a fragment of your sequence.
      </p></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_faq_miscellaneous"></a>15.6. 
      Miscellaneous
    </h2></div></div></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_what_are_megahubs?"></a>15.6.1. 
	What are megahubs?
      </h3></div></div></div><p>
      </p><pre class="screen">
	I looked in the log file and that term "megahub" you told me about appears
	pretty much everywhere. First of all, what does it mean?
      </pre><p>
      </p><p>
	Megahub is MIRA's internal term for a read that is massively repetitive
	with respect to the other reads of the project, i.e., a read that is a
	megahub connects to an insane number of other reads.
      </p><p>
	This is a clear sign that something is wrong. Or that you have a quite
	repetitive eukaryote. But most of the time it's sequencing vectors
	(Sanger), A and B adaptors or paired-end linkers (454), unmasked
	poly-A signals (EST) or non-normalised EST libraries which contain
	high amounts of housekeeping genes (always the same or nearly the
	same).
      </p><p>
	Countermeasures to take are:
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    set clips for the sequencing vectors (Sanger) or Adaptors (454)
	    either in the XML or EXP files
	  </p></li><li class="listitem"><p>
	    for ESTs, mask poly-A in your input data (or let MIRA do it with the
	    -CL:cpat parameter)
	  </p></li><li class="listitem"><p>
	    only after the above steps have been made, use
	    the [-KS:mnr] switch to let mira automatically mask nasty
	    repeats, adjust the threshold with  [-SK:rt].
	  </p></li><li class="listitem"><p>
	    if everything else fails, filter out or mask sequences yourself in the
	    input data that come from housekeeping genes or nasty repeats.
	  </p></li></ul></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_passes_and_loops"></a>15.6.2. 
	Passes and loops
      </h3></div></div></div><p>
      </p><pre class="screen">
	While processing some contigs with repeats I get
	"Accepting probably misassembled contig because of too many iterations."
	What is this?
      </pre><p>
      </p><p>
	That's quite normal in the first few passes of an assembly. During each pass
	(-AS:nop), contigs get built one by one. After a contig has been finished, it
	checks itself whether it can find misassemblies due to repeats (and marks
	these internally). If no misassembly, perfect, build next contig. But if yes,
	the contig requests immediate re-assembly of itself.
      </p><p>
	But this can happen only a limited number of times (governed by -AS:rbl). If
	there are still misassemblies, the contig is stored away anyway ... chances
	are good that in the next full pass of the assembler, enough knowledge has
	been gained to correctly place the reads.
      </p><p>
	So, you need to worry only if these messages still appear during the last
	pass. The positions that cause this are marked with "SRMc" tags in the
	assemblies (CAF, ACE in the result dir; and some files in the info dir).
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_debris"></a>15.6.3. 
	Debris
      </h3></div></div></div><p>
      </p><pre class="screen">
	What are the debris composed of?
      </pre><p>
      </p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	    sequences too short (after trimming)
	  </p></li><li class="listitem"><p>
	    megahubs
	  </p></li><li class="listitem"><p>
	    sequences almost completely masked by the nasty repeat masker
	    ([-KS:mnr])
	  </p></li><li class="listitem"><p>
	    singlets, i.e., reads that after an assembly pass did not align
	    into any contig (or were rejected from every contig).
	  </p></li><li class="listitem"><p>
	    sequences that form a contig with fewer reads than defined by
	    [-AS:mrpc]
	  </p></li></ul></div><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_tmpf_files:_more_info_on_what_happened_during_the_assembly"></a>15.6.4. 
	Log and temporary files: more info on what happened during the assembly
      </h3></div></div></div><p>
      </p><pre class="screen">
	I do not understand why ... happened. Is there a way to find out?
      </pre><p>
	Yes. The tmp directory contains, beside temporary data, a number of
	log files with more or less readable information. While development
	versions of MIRA keep this directory after finishing, production
	versions normally delete this directory after an assembly. To keep the
	logs and temporary files also in production versions, use
	"-OUT:rtd=no".
      </p><p>
	As MIRA also tries to save as much disk space as possible, some logs
	and temporary files are rotated (which means that old logs and tmps
	get deleted). To switch off this behaviour, use
	"-OUT:rrot=no". Beware, the size of the tmp directory will increase,
	sometimes dramatically so.
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect_faq_sequence_clipping_after_load"></a>15.6.4.1. 
	  Sequence clipping after load
	</h4></div></div></div><p>
	  How MIRA clipped the reads after loading them can be found in the file
	  <code class="filename">mira_int_clippings.0.txt</code>. The entries look like this:
	</p><pre class="screen">
	  load:  minleft. U13a01d05.t1    Left: 11         -&gt; 30
	</pre><p>
	  Interpret this as: after loading, the read "U13a01d05.t1" had a left clipping
	  of eleven. The "minleft" clipping option of MIRA did not like it and set it to
	  30.
	</p><pre class="screen">
	  load:  bad seq. gnl|ti|1133527649       Shortened by 89 New right: 484
	</pre><p>
	</p><p>
	  Interpret this as: after loading, the read "gnl|ti|1133527649" was checked
	  with the "bad sequence search" clipping algorithm which determined that there
	  apparently is something dubious, so it shortened the read by 89 bases, setting
	  the new right clip to position 484.
	</p></div></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect_faq_platforms_and_compiling"></a>15.7. 
      Platforms and Compiling
    </h2></div></div></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect_faq_windows"></a>15.7.1. 
	Windows
      </h3></div></div></div><p>
      </p><pre class="screen">
	Also, is MIRA available on a Windows platform?
      </pre><p>
      </p><p>
	As a matter of fact: it was and may be again. While I haven't done it myself,
	according to reports I got, compiling MIRA 2.9.3* in a Cygwin environment was
	actually painless. But since then BOOST and multi-threading have been included
	and I am not sure whether it is still as easy.
      </p><p>
	I'd be thankful for reports :-)
      </p></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_maf"></a>Chapter 16. The MAF format</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect1_introduction:_why_an_own_assembly_format?">16.1. 
      Introduction: why an own assembly format?
    </a></span></dt><dt><span class="sect1"><a href="#sect1_the_maf_format">16.2. 
      The MAF format
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect2_basics">16.2.1. 
	Basics
      </a></span></dt><dt><span class="sect2"><a href="#sect2_reads">16.2.2. 
	Reads
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect3_simple_example">16.2.2.1. 
	  Simple example
	</a></span></dt><dt><span class="sect3"><a href="#sect3_list_of_records_for_reads">16.2.2.2. 
	  List of records for reads
	</a></span></dt><dt><span class="sect3"><a href="#sect3_interpreting_clipping_values">16.2.2.3. 
	  Interpreting clipping values
	</a></span></dt></dl></dd><dt><span class="sect2"><a href="#sect2_contigs">16.2.3. 
	Contigs
      </a></span></dt><dd><dl><dt><span class="sect3"><a href="#sect3_simple_example_2">16.2.3.1. 
	  Simple example 2
	</a></span></dt><dt><span class="sect3"><a href="#sect3_list_of_records_for_contigs">16.2.3.2. 
	  List of records for contigs
	</a></span></dt></dl></dd></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">Design flaws travel in herds.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
  The documentation of MIRA 3.9.x has not completely caught up yet with the changes introduced by MIRA now using manifest files. Quite a number of recipes still show the old command-line style, e.g.:
  </p><pre class="screen">
mira --project=... --job=... ...</pre><p>
    For those cases, please refer to chapter 3 (the reference) for how to write manifest files.
  </p></td></tr></table></div><p>
    This document describes the purpose and format of the MAF format, version
    1, which has been superseded by version 2 (not described here
    yet). But as v1 and v2 are very similar (only the notion of readgroups is
    a big change), I'll let this description live until I have time to update
    this section.
  </p><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect1_introduction:_why_an_own_assembly_format?"></a>16.1. 
      Introduction: why an own assembly format?
    </h2></div></div></div><p>
      I had been on the hunt for some time for a file format that allows MIRA to
      quickly save and load reads and full assemblies. There are currently a number
      of alignment format files on the market and MIRA can read and/or write most of
      them. Why not take one of these? It turned out that all (well, the ones I
      know: ACE, BAF, CAF, CALF, EXP, FRG) have some kind of no-go 'feature' (or problem
      or bug) that makes one's life pretty difficult if one wants to write or parse
      that given file format.
    </p><p>
      What I needed for MIRA was a format that:
    </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	  is easy to parse
	</p></li><li class="listitem"><p>
	  is quick to parse
	</p></li><li class="listitem"><p>
	  contains all needed information of an assembly that MIRA and many
	  finishing programs use: reads (with sequence and qualities) and contigs,
	  tags etc.
	</p></li></ol></div><p>
    </p><p>
      MAF is not a format with the smallest possible footprint (though it fares quite
      well in comparison to ACE, CAF and EXP), but as it's meant as an interchange format,
      it'll do. It can be easily indexed and does not need string lookups during
      parsing.
    </p><p>
      I took the liberty to combine many good ideas from EXP, BAF, CAF and FASTQ
      while defining the format and if anything is badly designed, it's all my
      fault.
    </p></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect1_the_maf_format"></a>16.2. 
      The MAF format
    </h2></div></div></div><p>
      This describes version 1 of the MAF format. If the need arises, enhancements
      like meta-data about total number of contigs and reads will be implemented in the
      next version.
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_basics"></a>16.2.1. 
	Basics
      </h3></div></div></div><p>
	MAF ...
      </p><div class="orderedlist"><ol class="orderedlist" type="1"><li class="listitem"><p>
	    ... has for each record a keyword at the beginning of the line, followed
	    by exactly one blank (a space or a tab), then followed by the values for
	    this record. At the moment keywords are two character keywords, but keywords
	    with other lengths might appear in the future
	  </p></li><li class="listitem"><p>
	    ... is strictly line oriented. Each record is terminated by a newline,
	    no record spans across lines.
	  </p></li></ol></div><p>
      </p><p>
	All coordinates start at 1, i.e., there is no 0 value for coordinates.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_reads"></a>16.2.2. 
	Reads
      </h3></div></div></div><p>
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect3_simple_example"></a>16.2.2.1. 
	  Simple example
	</h4></div></div></div><p>
	  Here's an example for a simple read, just the read name and the sequence:
	</p><pre class="screen">
	  RD      U13a05e07.t1
	  RS      CTTGCATGCCTGCAGGTCGACTCTAGAAGGACCCCGATCA
	  ER
	</pre><p>
	</p><p>
	  Reads start with RD and end with ER, the RD keyword is always followed by the
	  name of the read, ER stands on its own. Reads also should contain a sequence
	  (RS). Everything else is optional. In the following example, the read has
	  additional quality values (RQ), template definitions (name in TN, minimum and
	  maximum insert size in TF and TT), a pointer to the file with the raw data (SF),
	  a left clip which covers sequencing vector or adaptor sequence (SL), a left
	  clip covering low quality (QL), a right clip covering low quality (QR), a
	  right clip covering sequencing vector or adaptor sequence (SR), alignment to
	  original sequence (AO), a tag (RT) and the sequencing technology it was
	  generated with (ST).
	</p><pre class="screen">
	  RD      U13a05e07.t1
	  RS      CTTGCATGCCTGCAGGTCGACTCTAGAAGGACCCCGATCA
	  RQ      ,-+*,1-+/,36;:6&lt;=3327&lt;7A1/,,).('..7=@E8:
	  TN      U13a05e07
	  DI      F
	  TF      1200
	  TT      1800
	  SF      U13a05e07.t1.scf
	  SL      4
	  QL      7
	  QR      30
	  SR      32
	  AO      1 40 1 40
	  RT      ALUS 10 15 Some comment to this read tag.
	  ST      Sanger
	  ER
	</pre><p>
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect3_list_of_records_for_reads"></a>16.2.2.2. 
	  List of records for reads
	</h4></div></div></div><p>
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      RD <span class="emphasis"><em>string: readname</em></span>
	    </p><p> RD followed by the read name starts a read.
	    </p></li><li class="listitem"><p>
	      LR <span class="emphasis"><em>integer: read length</em></span>
	    </p><p>
	      The length of the read can be given optionally in LR. This is
	      meant to help the parser perform sanity checks and eventually
	      pre-allocate memory for sequence and quality.
	    </p><p>
	      MIRA at the moment only writes LR lines for reads with more than
	      2000 bases.
	    </p></li><li class="listitem"><p>
	      RS <span class="emphasis"><em>string: DNA sequence</em></span>
	    </p><p> Sequence of a read is stored in RS.
	    </p></li><li class="listitem"><p>
	      RQ <span class="emphasis"><em>string: qualities</em></span>
	    </p><p> Qualities are stored in FASTQ format, i.e., each quality
	    value + 33 is written as a single ASCII character.
	    </p></li><li class="listitem"><p>
	      SV <span class="emphasis"><em>string: sequencing vector</em></span>
	    </p><p> Name of the sequencing vector or
	    adaptor used in this read.
	    </p></li><li class="listitem"><p>
	      TN <span class="emphasis"><em>string: template name</em></span>
	    </p><p> Template name. This defines the DNA template a sequence
	    comes from. In its simplest form, a DNA template is sequenced
	    only once. In paired-end sequencing, a DNA template is sequenced
	    once in forward and once in reverse direction (Sanger, 454,
	    Solexa). In Sanger sequencing, several forward and/or reverse
	    reads can be sequenced from a DNA template. In PacBio sequencing,
	    a DNA template can be sequenced in several "strobes", leading to
	    multiple reads on a DNA template.
	    </p></li><li class="listitem"><p>
	      DI <span class="emphasis"><em>character: F or R</em></span>
	    </p><p> Direction of the read with respect to the
	    template. F for forward, R for reverse.
	    </p></li><li class="listitem"><p>
	      TF <span class="emphasis"><em>integer: template size from</em></span>
	    </p><p> Minimum estimated
	    size of a sequencing template. In paired-end sequencing, this is the minimum
	    distance of the read pair.
	    </p></li><li class="listitem"><p>
	      TT <span class="emphasis"><em>integer: template size to</em></span>
	    </p><p> Maximum estimated
	    size of a sequencing template. In paired-end sequencing, this is the maximum
	    distance of the read pair.
	    </p></li><li class="listitem"><p>
	      SF <span class="emphasis"><em>string: sequencing file</em></span>
	    </p><p> Name of the sequencing file which
	    contains raw data for this read.
	    </p></li><li class="listitem"><p>
	      SL <span class="emphasis"><em>integer: seqvec left</em></span>
	    </p><p>
	      Clip left due to sequencing vector. Assumed to be 1 if not
	      present. Note that left clip values are excluding, e.g.: a value
	      of '7' clips off the left 6 bases.
	    </p></li><li class="listitem"><p>
	      QL <span class="emphasis"><em>integer: qual left</em></span>
	    </p><p>
	      Clip left due to low quality. Assumed to be 1 if not
	      present. Note that left clip values are excluding, e.g.: a value
	      of '7' clips off the left 6 bases.
	    </p></li><li class="listitem"><p>
	      CL <span class="emphasis"><em>integer: clip left</em></span>
	    </p><p>
	      Clip left (any reason). Assumed to be 1 if not present. Note
	      that left clip values are excluding, e.g.: a value of '7' clips
	      off the left 6 bases.
	    </p></li><li class="listitem"><p>
	      SR <span class="emphasis"><em>integer: seqvec right</em></span>
	    </p><p> Clip right due to sequencing
	    vector. Assumed to be the length of the sequence if not present. Note that
	    right clip values are including, e.g., a value of '10' leaves the bases 1 to
	    9 and clips at and including base 10 and higher.
	    </p></li><li class="listitem"><p>
	      QR <span class="emphasis"><em>integer: qual right</em></span>
	    </p><p> Clip right due to low quality. Assumed
	    to be the length of the sequence if not present. Note that right clip values
	    are including, e.g., a value of '10' leaves the bases 1 to 9 and clips at
	    and including base 10 and higher.
	    </p></li><li class="listitem"><p>
	      CR <span class="emphasis"><em>integer: clip right</em></span>
	    </p><p> Clip right (any reason). Assumed to be
	    the length of the sequence if not present. Note that
	    right clip values are including, e.g., a value of '10' leaves the bases 1 to
	    9 and clips at and including base 10 and higher.
	    </p></li><li class="listitem"><p>
	      AO <span class="emphasis"><em>four integers: x1 y1 x2 y2</em></span>
	    </p><p> AO stands for "Align to
	    Original". The interval [x1 y1] in the read as stored in the MAF file aligns
	    with [x2 y2] in the original, unedited read sequence. This allows one to model
	    insertions and deletions in the read and still be able to find the correct
	    position in the original, base-called sequence data.
	    </p><p> A read can have
	    several AO lines which together define all the edits performed to this
	    read.
	    </p><p> Assumed to be "1 x 1 x" if not present, where 'x' is the length of
	    the unclipped sequence.
	    </p></li><li class="listitem"><p>
	      RT <span class="emphasis"><em>string + 2 integers + optional string: type x1 y1 comment</em></span>
	    </p><p> Read tags are given by naming the tag type, which positions
	    in the read the tag spans in the interval [x1 y1] and afterwards
	    optionally a comment. As MAF is strictly line oriented, newline
	    characters in the comment are encoded
	    as <code class="literal">\n</code>.
	    </p><p> If x1 &gt; y1, the tag is in reverse direction.
	    </p><p>
	      The tag type can be a free form string, though MIRA will
	      recognise and work with tag types used by the Staden gap4
	      package (and of course the MIRA tags as described in the main
	      documentation of MIRA).
	    </p></li><li class="listitem"><p>
	      ST <span class="emphasis"><em>string: sequencing technology</em></span>
	    </p><p> The following technologies
	    can be defined: Sanger, 454, Solexa, SOLiD.
	    </p></li><li class="listitem"><p>
	      SN <span class="emphasis"><em>string: strain name</em></span>
	    </p><p> Strain name of the sample that was
	    sequenced, this is a free form string.
	    </p></li><li class="listitem"><p>
	      MT <span class="emphasis"><em>string: machine type</em></span>
	    </p><p> Machine type which generated the data,
	    this is a free form string.
	    </p></li><li class="listitem"><p>
	      BC <span class="emphasis"><em>string: base caller</em></span>
	    </p><p>
	      Base calling program used to call bases
	    </p></li><li class="listitem"><p>
	      IB <span class="emphasis"><em>boolean (0 or 1): is backbone</em></span>
	    </p><p> Whether the read is a backbone. Reads used as reference
	    (backbones) in mapping assemblies get this attribute.
	    </p></li><li class="listitem"><p>
	      IC <span class="emphasis"><em>boolean (0 or 1)</em></span>
	    </p><p> Whether the read is a coverage equivalent
	    read (e.g. from mapping Solexa). This is internal to MIRA.
	    </p></li><li class="listitem"><p>
	      IR <span class="emphasis"><em>boolean (0 or 1)</em></span>
	    </p><p> Whether the read is a rail. This also is
	    internal to MIRA.
	    </p></li><li class="listitem"><p>
	      ER
	    </p><p> This ends a read and is mandatory.
	    </p></li></ul></div><p>
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect3_interpreting_clipping_values"></a>16.2.2.3. 
	  Interpreting clipping values
	</h4></div></div></div><p>
	  Every left and right clipping pair (SL &amp; SR, QL &amp; QR, CL &amp; CR) forms a clear
	  range in the interval [left right[ in the sequence of a read. E.g. a read with
	  SL=4 and SR=10 has the bases 1,2,3 clipped away on the left side, the bases
	  4,5,6,7,8,9 as clear range and the bases 10 and following clipped away on the
	  right side.
	</p><p>
	  The left clip of a read is determined as max(SL,QL,CL) (the rightmost left
	  clip) whereas the right clip is min(SR,QR,CR).
	</p></div></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_contigs"></a>16.2.3. 
	Contigs
      </h3></div></div></div><p>
	Contigs are not much more than containers containing reads with some
	additional information. Contrary to CAF or ACE, MAF does not first store all reads in
	single containers and then define the contigs. In MAF, contigs are defined as
	the outer container and within those, the reads are stored like normal reads.
      </p><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect3_simple_example_2"></a>16.2.3.1. 
	  Simple example 2
	</h4></div></div></div><p>
	  The above example for a read can be encased in a contig like this (with two
	  consensus tags gratuitously added in):
	</p><pre class="screen">
	  CO      contigname_s1
	  NR      1
	  LC      24
	  CS      TGCCTGCAGGTCGACTCTAGAAGG
	  CQ      -+/,36;:6&lt;=3327&lt;7A1/,,).
	  CT      COMM 5 8 Some comment to this consensus tag.
	  CT      COMM 7 12 Another comment to this consensus tag.
	  \\
	  RD      U13a05e07.t1
	  RS      CTTGCATGCCTGCAGGTCGACTCTAGAAGGACCCCGATCA
	  RQ      ,-+*,1-+/,36;:6&lt;=3327&lt;7A1/,,).('..7=@E8:
	  TN      U13a05e07
	  TF      1200
	  TT      1800
	  SF      U13a05e07.t1.scf
	  SL      4
	  SR      32
	  QL      7
	  QR      30
	  AO      1 40 1 40
	  RT      ALUS 10 15 Some comment to this read tag.
	  ST      Sanger
	  ER
	  AT      1 24 7 30
	  //
	  EC
	</pre><p>
	</p><p>
	  Note that the read shown previously (and now encased in a contig) is
	  absolutely unchanged. It has just been complemented with a bit of data which
	  describes the contig as well as with a one liner which places the read into
	  the contig.
	</p></div><div class="sect3"><div class="titlepage"><div><div><h4 class="title"><a name="sect3_list_of_records_for_contigs"></a>16.2.3.2. 
	  List of records for contigs
	</h4></div></div></div><p>
	</p><div class="itemizedlist"><ul class="itemizedlist" style="list-style-type: disc; "><li class="listitem"><p>
	      CO <span class="emphasis"><em>string: contig name</em></span>
	    </p><p> CO starts a contig, the contig name
	    behind is mandatory but can be any string, including numbers.
	    </p></li><li class="listitem"><p>
	      NR <span class="emphasis"><em>integer: num reads in contig</em></span>
	    </p><p> This is optional but highly
	    recommended.
	    </p></li><li class="listitem"><p>
	      LC <span class="emphasis"><em>integer: contig length</em></span>
	    </p><p> Note that this length defines the length of the 'clear
	    range' of the consensus. It is 100% equal to the length of the CS
	    (sequence) and CQ (quality) strings below.
	    </p></li><li class="listitem"><p>
	      CT <span class="emphasis"><em>string + 2 integers + optional string: identifier
	      x1 y1 comment</em></span>
	    </p><p> Consensus tags are defined like read tags but apply to the
	    consensus. Here too, the interval [x1 y1] is inclusive and if x1 &gt; y1, the tag
	    is in reverse direction.
	    </p></li><li class="listitem"><p>
	      CS <span class="emphasis"><em>string: consensus sequence</em></span>
	    </p><p> Sequence of a consensus is stored in CS.
	    </p></li><li class="listitem"><p>
	      CQ <span class="emphasis"><em>string: qualities</em></span>
	    </p><p> Consensus qualities are stored in FASTQ
	    format, i.e., each quality value + 33 is written as a single ASCII character.
	    </p></li><li class="listitem"><p>
	      \\
	    </p><p> This marks the start of read data of this contig. After
	    this, all reads are stored one after the other, just separated by
	    an "AT" line (see below).
	    </p></li><li class="listitem"><p>
	      AT <span class="emphasis"><em>Four integers: x1 y1 x2 y2</em></span>
	    </p><p> The AT (Assemble_To) line defines the placement of the read
	    in the contig and immediately follows the closing "ER" of a read
	    so that parsers do not need to perform time-consuming string
	    lookups. Every read in a contig has exactly one AT line.
	    </p><p> The interval
	    [x2 y2] of the read (i.e., the unclipped data, also called the 'clear range')
	    aligns with the interval [x1 y1] of the contig. If x1 &gt; y1 (the contig
	    positions), then the reverse complement of the read is aligned to the
	    contig. For the read positions, x2 is always &lt; y2.
	    </p></li><li class="listitem"><p>
	      //
	    </p><p> This marks the end of read data.
	    </p></li><li class="listitem"><p>
	      EC
	    </p><p> This ends a contig and is mandatory.
	    </p></li></ul></div></div></div></div></div><div class="chapter"><div class="titlepage"><div><div><h1 class="title"><a name="chap_logfiles"></a>Chapter 17. Log and temporary files used by MIRA</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Bastien</span> <span class="surname">Chevreux</span></h3><code class="email">&lt;<a class="email" href="mailto:bach@chevreux.org">bach@chevreux.org</a>&gt;</code></div></div><div><p class="releaseinfo">MIRA Version 4.9.5_2</p></div><div><p class="copyright">Copyright © 2014 Bastien Chevreux</p></div></div></div><div class="toc"><p><b>Table of Contents</b></p><dl class="toc"><dt><span class="sect1"><a href="#sect1_logf_introduction">17.1. 
      Introduction
    </a></span></dt><dt><span class="sect1"><a href="#sect1_logf_the_files">17.2. 
      The files
    </a></span></dt><dd><dl><dt><span class="sect2"><a href="#sect2_logf_mira_error_reads_invalid">17.2.1. 
	mira_error_reads_invalid
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_info_reads_tooshort">17.2.2. 
	mira_info_reads_tooshort
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_int_alignextends_preassembly10txt">17.2.3. 
	mira_int_alignextends_preassembly1.0.txt
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_int_clippings0txt">17.2.4. 
	mira_int_clippings.0.txt
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_int_posmatch_megahubs_passxlst">17.2.5. 
	mira_int_posmatch_megahubs_pass.X.lst
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_int_posmatch_multicopystat_preassembly0txt">17.2.6. 
	mira_int_posmatch_multicopystat_preassembly.0.txt
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_int_posmatch_rawhashhits_passxlst">17.2.7. 
	mira_int_posmatch_rawhashhits_pass.X.lst
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_int_skimmarknastyrepeats_hist_passxlst">17.2.8. 
	mira_int_skimmarknastyrepeats_hist_pass.X.lst
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_int_skimmarknastyrepeats_nastyseq_passxlst">17.2.9. 
	mira_int_skimmarknastyrepeats_nastyseq_pass.X.lst
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_int_vectorclip_passxtxt">17.2.10. 
	mira_int_vectorclip_pass.X.txt
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_miratmpads_passxforward_and_miratmpads_passxcomplement">17.2.11. 
	miratmp.ads_pass.X.forward and miratmp.ads_pass.X.complement
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_miratmpads_passxreject">17.2.12. 
	miratmp.ads_pass.X.reject
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_miratmpnoqualities">17.2.13. 
	miratmp.noqualities
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_miratmpusedids">17.2.14. 
	miratmp.usedids
      </a></span></dt><dt><span class="sect2"><a href="#sect2_logf_mira_readpoolinfolst">17.2.15. 
	mira_readpoolinfo.lst
      </a></span></dt></dl></dd></dl></div><div class="blockquote"><table border="0" class="blockquote" style="width: 100%; cellspacing: 0; cellpadding: 0;" summary="Block quote"><tr><td width="10%" valign="top"> </td><td width="80%" valign="top"><p>
      <span class="emphasis"><em><span class="quote">&#8220;<span class="quote">The amount of entropy in the universe is constant - except when it increases.
      </span>&#8221;</span></em></span>
    </p></td><td width="10%" valign="top"> </td></tr><tr><td width="10%" valign="top"> </td><td colspan="2" align="right" valign="top">--<span class="attribution">Solomon Short</span></td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top"><p>
  The documentation of MIRA 3.9.x has not completely caught up yet with the changes introduced by MIRA now using manifest files. Quite a number of recipes still show the old command-line style, e.g.:
  </p><pre class="screen">
mira --project=... --job=... ...</pre><p>
    For those cases, please refer to chapter 3 (the reference) for how to write manifest files.
  </p></td></tr></table></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect1_logf_introduction"></a>17.1. 
      Introduction
    </h2></div></div></div><p>
      The tmp directory used by mira (usually
      <code class="filename">&lt;projectname&gt;_d_tmp</code>) may contain a number of
      files with information which could be interesting for other uses than
      the pure assembly. This guide gives a short overview.
    </p><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      This guide is probably the least complete and most out-of-date as it is
      updated only very infrequently. If in doubt, ask on the MIRA talk
      mailing list.
    </td></tr></table></div><div class="warning" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Warning"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Warning]" src="images/warning.png"></td><th align="left">Warning</th></tr><tr><td align="left" valign="top">
      Please note that the format of these files may change over time,
      although I try very hard to keep changes reduced to a minimum.
    </td></tr></table></div><div class="note" style="margin-left: 0.5in; margin-right: 0.5in;"><table border="0" summary="Note"><tr><td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="images/note.png"></td><th align="left">Note</th></tr><tr><td align="left" valign="top">
      Remember that mira has two options that control whether log and
      temporary files get deleted: while [-OUT:rtd] removes the
      complete tmp directory after an assembly,  [-OUT:rrot] removes
      only those log and temporary files which are not needed anymore for the
      continuation of the assembly. Setting both options to <span class="underline">no</span> will keep all log and temporary files.
    </td></tr></table></div></div><div class="sect1"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="sect1_logf_the_files"></a>17.2. 
      The files
    </h2></div></div></div><p>
    </p><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_error_reads_invalid"></a>17.2.1. 
	mira_error_reads_invalid
      </h3></div></div></div><p>
	A simple list of those reads that were invalid (no sequence or similar
	problems).
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_info_reads_tooshort"></a>17.2.2. 
	mira_info_reads_tooshort
      </h3></div></div></div><p>
	A simple list of those reads that were sorted out because the unclipped
	sequence was too short as defined by [-AS:mrl].
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_int_alignextends_preassembly10txt"></a>17.2.3. 
	mira_int_alignextends_preassembly1.0.txt
      </h3></div></div></div><p>
	If read extension is used ([-DP:ure]), this file contains the read
	name and the number of bases by which the right clipping was extended.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_int_clippings0txt"></a>17.2.4. 
	mira_int_clippings.0.txt
      </h3></div></div></div><p>
	If any of the [-CL:] options leads to the clipping of a read, this
	file will tell when, which clipping, which read and by how much (or to where)
	the clippings were set.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_int_posmatch_megahubs_passxlst"></a>17.2.5. 
	mira_int_posmatch_megahubs_pass.X.lst
      </h3></div></div></div><p>
	Note: replace the <span class="emphasis"><em>X</em></span> by the pass of mira. Should any read be
	categorised as megahub during the all-against-all search (SKIM3), this file
	will tell you which.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_int_posmatch_multicopystat_preassembly0txt"></a>17.2.6. 
	mira_int_posmatch_multicopystat_preassembly.0.txt
      </h3></div></div></div><p>
	After the initial all-against-all search (SKIM3), this file tells you with how
	many other reads each read overlaps. Furthermore, reads that have more
	overlaps than expected are tagged with &#8220;mc&#8221; (multicopy).
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_int_posmatch_rawhashhits_passxlst"></a>17.2.7. 
	mira_int_posmatch_rawhashhits_pass.X.lst
      </h3></div></div></div><p>
	Note: replace the <span class="emphasis"><em>X</em></span> by the pass of mira. Similar to
	<code class="filename">mira_int_posmatch_multicopystat_preassembly.0.txt</code>, this counts the
	kmer hits of each read to other reads. This time however per pass.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_int_skimmarknastyrepeats_hist_passxlst"></a>17.2.8. 
	mira_int_skimmarknastyrepeats_hist_pass.X.lst
      </h3></div></div></div><p>
	Note: replace the <span class="emphasis"><em>X</em></span> by the pass of mira. Only written if
	 [-KS:mnr] is set to <span class="underline">yes</span>. This file contains a
	histogram of kmer occurrences encountered by SKIM3.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_int_skimmarknastyrepeats_nastyseq_passxlst"></a>17.2.9. 
	mira_int_skimmarknastyrepeats_nastyseq_pass.X.lst
      </h3></div></div></div><p>
	Note: replace the <span class="emphasis"><em>X</em></span> by the pass of mira.  Only written if
	 [-KS:mnr] is set to <span class="underline">yes</span>. One of the more interesting
	files if you want to know which repetitive sequences cause the assembly to be
	really difficult: for each masked part of a read, the masked sequence is
	shown here.
      </p><p>
	E.g.
      </p><pre class="screen">
	U13a04h11.t1    TATATATATATATATATATATATA
	U13a05b01.t1    TATATATATATATATATATATATA
	U13a05c07.t1    AAAAAAAAAAAAAAA
	U13a05e12.t1    CTCTCTCTCTCTCTCTCTCTCTCTCTCTC
      </pre><p>
	Simple repeats like the ones shown above will certainly pop up there,
	but a few other sequences (e.g. rDNA/rRNA, or SINEs and LINEs in
	eukaryotes) will also appear.
      </p><p>
	Nifty thing to try out if you want to have a more compressed overview: sort
	and unify by the second column.
      </p><pre class="screen">
	sort -k 2 -u mira_int_skimmarknastyrepeats_nastyseq_pass.X.lst
      </pre><p>
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_int_vectorclip_passxtxt"></a>17.2.10. 
	mira_int_vectorclip_pass.X.txt
      </h3></div></div></div><p>
	Note: replace the <span class="emphasis"><em>X</em></span> by the pass of mira. Only written if
	 [-CL:pvlc] is set to <span class="underline">yes</span>. Tells you where possible
	sequencing vector (or adaptor) leftovers were found and clipped (or not
	clipped).
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_miratmpads_passxforward_and_miratmpads_passxcomplement"></a>17.2.11. 
	miratmp.ads_pass.X.forward and miratmp.ads_pass.X.complement
      </h3></div></div></div><p>
	Note: replace the <span class="emphasis"><em>X</em></span> by the pass of mira. Which read aligns with
	Smith-Waterman against which other read, 'forward-forward' and
	'forward-complement'.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_miratmpads_passxreject"></a>17.2.12. 
	miratmp.ads_pass.X.reject
      </h3></div></div></div><p>
	Note: replace the <span class="emphasis"><em>X</em></span> by the pass of mira. Which possible read
	overlaps failed the Smith-Waterman alignment check.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_miratmpnoqualities"></a>17.2.13. 
	miratmp.noqualities
      </h3></div></div></div><p>
	Which reads went completely without qualities into the assembly.
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_miratmpusedids"></a>17.2.14. 
	miratmp.usedids
      </h3></div></div></div><p>
	Which reads effectively went into the assembly (after clipping etc.).
      </p></div><div class="sect2"><div class="titlepage"><div><div><h3 class="title"><a name="sect2_logf_mira_readpoolinfolst"></a>17.2.15. 
	mira_readpoolinfo.lst
      </h3></div></div></div></div></div></div></div></body></html>