This file is indexed.

/usr/share/pyshared/mrjob/emr.py is in python-mrjob 0.3.3.2-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

# Copyright 2009-2012 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import with_statement

from collections import defaultdict
from datetime import datetime
from datetime import timedelta
import fnmatch
import logging
import os
import posixpath
import random
import re
import shlex
import signal
import socket
from subprocess import Popen
from subprocess import PIPE
import time
import urllib2

try:
    from cStringIO import StringIO
    StringIO  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    from StringIO import StringIO

try:
    import simplejson as json  # preferred because of C speedups
    json  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    import json  # built in to Python 2.6 and later

try:
    import boto
    import boto.ec2
    import boto.emr
    import boto.exception
    import boto.utils
    from mrjob import boto_2_1_1_83aae37b
    boto  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    # don't require boto; MRJobs don't actually need it when running
    # inside hadoop streaming
    boto = None

import mrjob
from mrjob import compat
from mrjob.conf import combine_cmds
from mrjob.conf import combine_dicts
from mrjob.conf import combine_lists
from mrjob.conf import combine_paths
from mrjob.conf import combine_path_lists
from mrjob.logparsers import TASK_ATTEMPTS_LOG_URI_RE
from mrjob.logparsers import STEP_LOG_URI_RE
from mrjob.logparsers import EMR_JOB_LOG_URI_RE
from mrjob.logparsers import NODE_LOG_URI_RE
from mrjob.logparsers import scan_for_counters_in_files
from mrjob.logparsers import scan_logs_in_order
from mrjob.parse import is_s3_uri
from mrjob.parse import parse_s3_uri
from mrjob.pool import est_time_to_hour
from mrjob.pool import pool_hash_and_name
from mrjob.retry import RetryWrapper
from mrjob.runner import MRJobRunner
from mrjob.runner import GLOB_RE
from mrjob.ssh import ssh_cat
from mrjob.ssh import ssh_ls
from mrjob.ssh import ssh_copy_key
from mrjob.ssh import ssh_slave_addresses
from mrjob.ssh import SSHException
from mrjob.ssh import SSH_PREFIX
from mrjob.ssh import SSH_LOG_ROOT
from mrjob.ssh import SSH_URI_RE
from mrjob.util import buffer_iterator_to_line_iterator
from mrjob.util import cmd_line
from mrjob.util import extract_dir_for_tar
from mrjob.util import hash_object
from mrjob.util import read_file


log = logging.getLogger('mrjob.emr')

JOB_TRACKER_RE = re.compile('(\d{1,3}\.\d{2})%')

# if EMR throttles us, how long to wait (in seconds) before trying again?
EMR_BACKOFF = 20
EMR_BACKOFF_MULTIPLIER = 1.5
EMR_MAX_TRIES = 20  # this takes about a day before we run out of tries

# the port to tunnel to
EMR_JOB_TRACKER_PORT = 9100
EMR_JOB_TRACKER_PATH = '/jobtracker.jsp'

MAX_SSH_RETRIES = 20

# ssh should fail right away if it can't bind a port
WAIT_FOR_SSH_TO_FAIL = 1.0

# sometimes AWS gives us seconds as a decimal, which we can't parse
# with boto.utils.ISO8601
SUBSECOND_RE = re.compile('\.[0-9]+')

# map from AWS region to EMR endpoint. See
# http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?ConceptsRequestEndpoints.html
REGION_TO_EMR_ENDPOINT = {
    'EU': 'eu-west-1.elasticmapreduce.amazonaws.com',
    'us-east-1': 'us-east-1.elasticmapreduce.amazonaws.com',
    'us-west-1': 'us-west-1.elasticmapreduce.amazonaws.com',
    '': 'elasticmapreduce.amazonaws.com',  # when no region specified
}

# map from AWS region to S3 endpoint. See
# http://docs.amazonwebservices.com/AmazonS3/latest/dev/MakingRequests.html#RequestEndpoints
REGION_TO_S3_ENDPOINT = {
    'EU': 's3-eu-west-1.amazonaws.com',
    'us-east-1': 's3.amazonaws.com',  # no region-specific endpoint
    'us-west-1': 's3-us-west-1.amazonaws.com',
    'ap-southeast-1': 's3-ap-southeast-1.amazonaws.com',  # no EMR endpoint yet
    '': 's3.amazonaws.com',
}

# map from AWS region to S3 LocationConstraint parameter for regions whose
# location constraints differ from their AWS regions. See
# http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?RESTBucketPUT.html
REGION_TO_S3_LOCATION_CONSTRAINT = {
    'us-east-1': '',
}


# map from instance type to number of compute units
# from http://aws.amazon.com/ec2/instance-types/
EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS = {
    't1.micro': 2,
    'm1.small': 1,
    'm1.large': 4,
    'm1.xlarge': 8,
    'm2.xlarge': 6.5,
    'm2.2xlarge': 13,
    'm2.4xlarge': 26,
    'c1.medium': 5,
    'c1.xlarge': 20,
    'cc1.4xlarge': 33.5,
    'cg1.4xlarge': 33.5,
}
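# Illustrative use of the table above (a sketch, not code from this module):
# when pooling job flows, flows whose instances add up to more compute units
# are preferred, e.g. a flow with two c1.medium core nodes counts as
# 2 * EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS['c1.medium'] == 10 compute units.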


# map from instance type to GB of memory
# from http://aws.amazon.com/ec2/instance-types/
EC2_INSTANCE_TYPE_TO_MEMORY = {
    't1.micro': 0.6,
    'm1.small': 1.7,
    'm1.large': 7.5,
    'm1.xlarge': 15,
    'm2.xlarge': 17.5,
    'm2.2xlarge': 34.2,
    'm2.4xlarge': 68.4,
    'c1.medium': 1.7,
    'c1.xlarge': 7,
    'cc1.4xlarge': 23,
    'cg1.4xlarge': 22,
}

# Use this to figure out which hadoop version we're using if it's not
# explicitly specified, so we can keep from passing deprecated command-line
# options to Hadoop. If we encounter an AMI version we don't recognize,
# we use whatever version matches 'latest'.
#
# The reason we don't just create a job flow and then query its Hadoop version
# is that for most jobs, we create the steps and the job flow at the same time.
AMI_VERSION_TO_HADOOP_VERSION = {
    None: '0.18',  # ami_version not specified means version 1.0
    '1.0': '0.18',
    '2.0': '0.20.205',
    'latest': '0.20.205',
}
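# Illustrative lookup (a sketch; the actual resolution happens inside
# EMRJobRunner): unrecognized AMI versions fall back to the 'latest' entry:
#
#   hadoop_version = AMI_VERSION_TO_HADOOP_VERSION.get(
#       ami_version, AMI_VERSION_TO_HADOOP_VERSION['latest'])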

# EMR's hard limit on number of steps in a job flow
MAX_STEPS_PER_JOB_FLOW = 256


def s3_key_to_uri(s3_key):
    """Convert a boto Key object into an ``s3://`` URI"""
    return 's3://%s/%s' % (s3_key.bucket.name, s3_key.name)
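# Example (illustrative): a Key in bucket 'walrus' with name 'data/foo.gz'
# yields 's3://walrus/data/foo.gz'.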


# AWS actually gives dates in two formats, and we only recently started using
# API calls that return the second. So the date parsing function is called
# iso8601_to_*, but it also parses RFC1123.
# Until boto starts seamlessly parsing these, we check for them ourselves.

# Thu, 29 Mar 2012 04:55:44 GMT
RFC1123 = '%a, %d %b %Y %H:%M:%S %Z'

def iso8601_to_timestamp(iso8601_time):
    iso8601_time = SUBSECOND_RE.sub('', iso8601_time)
    try:
        return time.mktime(time.strptime(iso8601_time, boto.utils.ISO8601))
    except ValueError:
        return time.mktime(time.strptime(iso8601_time, RFC1123))


def iso8601_to_datetime(iso8601_time):
    iso8601_time = SUBSECOND_RE.sub('', iso8601_time)
    try:
        return datetime.strptime(iso8601_time, boto.utils.ISO8601)
    except ValueError:
        return datetime.strptime(iso8601_time, RFC1123)
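# Both helpers accept either format (illustrative values):
#
#   iso8601_to_datetime('2012-03-29T04:55:44Z')            # ISO8601
#   iso8601_to_datetime('Thu, 29 Mar 2012 04:55:44 GMT')   # RFC1123
#
# SUBSECOND_RE strips fractional seconds (e.g. '...44.123Z' -> '...44Z')
# before parsing, since boto.utils.ISO8601 can't handle them.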


def describe_all_job_flows(emr_conn, states=None, jobflow_ids=None,
                           created_after=None, created_before=None):
    """Iteratively call ``EmrConnection.describe_job_flows()`` until we really
    get all the available job flow information. Currently, 2 months of data
    is available through the EMR API.

    This is a way of getting around the limits of the API, both on number
    of job flows returned, and how far back in time we can go.

    :type states: list
    :param states: A list of strings with job flow states wanted
    :type jobflow_ids: list
    :param jobflow_ids: A list of job flow IDs
    :type created_after: datetime
    :param created_after: Bound on job flow creation time
    :type created_before: datetime
    :param created_before: Bound on job flow creation time
    """
    all_job_flows = []
    ids_seen = set()

    # weird things can happen if we send no args to the DescribeJobFlows API
    # (see Issue #346), so if nothing else is set, set created_before
    # to a day in the future.
    if not (states or jobflow_ids or created_after or created_before):
        created_before = datetime.utcnow() + timedelta(days=1)

    while True:
        if created_before and created_after and created_before < created_after:
            break

        log.debug('Calling describe_jobflows(states=%r, jobflow_ids=%r,'
                  ' created_after=%r, created_before=%r)' %
                  (states, jobflow_ids, created_after, created_before))
        try:
            results = emr_conn.describe_jobflows(
                states=states, jobflow_ids=jobflow_ids,
                created_after=created_after, created_before=created_before)
        except boto.exception.BotoServerError, ex:
            if 'ValidationError' in ex.body:
                log.debug(
                    '  reached earliest allowed created_before time, done!')
                break
            else:
                raise

        # don't count the same job flow twice
        job_flows = [jf for jf in results if jf.jobflowid not in ids_seen]
        log.debug('  got %d results (%d new)' % (len(results), len(job_flows)))

        all_job_flows.extend(job_flows)
        ids_seen.update(jf.jobflowid for jf in job_flows)

        if job_flows:
            # set created_before to be just after the start time of
            # the first job returned, to deal with job flows started
            # in the same second
            min_create_time = min(iso8601_to_datetime(jf.creationdatetime)
                                  for jf in job_flows)
            created_before = min_create_time + timedelta(seconds=1)
            # if someone managed to start 501 job flows in the same second,
            # they are still screwed (the EMR API only returns up to 500),
            # but this seems unlikely. :)
        else:
            if not created_before:
                created_before = datetime.utcnow()
            created_before -= timedelta(weeks=2)

    return all_job_flows
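# Typical usage (a sketch; assumes AWS credentials are already configured):
#
#   emr_conn = EMRJobRunner().make_emr_conn()
#   job_flows = describe_all_job_flows(
#       emr_conn, states=['RUNNING', 'WAITING'])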


def make_lock_uri(s3_tmp_uri, emr_job_flow_id, step_num):
    """Generate the URI to lock the job flow ``emr_job_flow_id``"""
    return s3_tmp_uri + 'locks/' + emr_job_flow_id + '/' + str(step_num)


def _lock_acquire_step_1(s3_conn, lock_uri, job_name, mins_to_expiration=None):
    bucket_name, key_prefix = parse_s3_uri(lock_uri)
    bucket = s3_conn.get_bucket(bucket_name)
    key = bucket.get_key(key_prefix)

    # EMRJobRunner should start using a job flow within about a second of
    # locking it, so if it's been a while, then it probably crashed and we
    # can just use this job flow.
    key_expired = False
    if key and mins_to_expiration is not None:
        last_modified = iso8601_to_datetime(key.last_modified)
        age = datetime.utcnow() - last_modified
        if age > timedelta(minutes=mins_to_expiration):
            key_expired = True

    if key is None or key_expired:
        key = bucket.new_key(key_prefix)
        key.set_contents_from_string(job_name)
        return key
    else:
        return None


def _lock_acquire_step_2(key, job_name):
    key_value = key.get_contents_as_string()
    return (key_value == job_name)


def attempt_to_acquire_lock(s3_conn, lock_uri, sync_wait_time, job_name,
                            mins_to_expiration=None):
    """Returns True if this session successfully took ownership of the lock
    specified by ``lock_uri``.
    """
    key = _lock_acquire_step_1(s3_conn, lock_uri, job_name, mins_to_expiration)
    if key is not None:
        time.sleep(sync_wait_time)
        success = _lock_acquire_step_2(key, job_name)
        if success:
            return True

    return False
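# Sketch of how the two-phase lock is used (names are illustrative):
#
#   s3_conn = EMRJobRunner().make_s3_conn()
#   lock_uri = make_lock_uri('s3://walrus/tmp/', 'j-JOBFLOWID', 0)
#   if attempt_to_acquire_lock(s3_conn, lock_uri, 5.0, 'my_job_name'):
#       ...  # we own the job flow; safe to add our steps
#
# Step 1 writes our job name to the lock key (unless a fresh lock already
# exists); step 2 re-reads the key after sync_wait_time to make sure a
# concurrent writer didn't overwrite us.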


class LogFetchError(Exception):
    pass


class EMRJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on Amazon Elastic MapReduce.

    :py:class:`EMRJobRunner` runs your job in an EMR job flow, which is
    basically a temporary Hadoop cluster. Normally, it creates a job flow
    just for your job; it's also possible to run your job in a specific
    job flow by setting *emr_job_flow_id* or to automatically choose a
    waiting job flow, creating one if none exists, by setting
    *pool_emr_job_flows*.

    Input, support, and jar files can be either local or on S3; use
    ``s3://...`` URLs to refer to files on S3.

    This class has some useful utilities for talking directly to S3 and EMR,
    so you may find it useful to instantiate it without a script::

        from mrjob.emr import EMRJobRunner

        emr_conn = EMRJobRunner().make_emr_conn()
        job_flows = emr_conn.describe_jobflows()
        ...

    See also: :py:meth:`~EMRJobRunner.__init__`.
    """
    alias = 'emr'

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.emr.EMRJobRunner` takes the same arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.

        *aws_access_key_id* and *aws_secret_access_key* are required if you
        haven't set them up already for boto (e.g. by setting the environment
        variables :envvar:`AWS_ACCESS_KEY_ID` and
        :envvar:`AWS_SECRET_ACCESS_KEY`)

        Additional options:

        :type additional_emr_info: JSON str, None, or JSON-encodable object
        :param additional_emr_info: Special parameters to select additional
                                    features, mostly to support beta EMR
                                    features. Pass a JSON string on the command
                                    line or use data structures in the config
                                    file (which is itself basically JSON).
        :type ami_version: str
        :param ami_version: EMR AMI version to use. This controls which Hadoop
                            version(s) are available and which version of
                            Python is installed, among other things; see \
http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?EnvironmentConfig_AMIVersion.html
                            for details. Implicitly defaults to AMI version
                            1.0 (this will change to 2.0 in mrjob v0.4).
        :type aws_access_key_id: str
        :param aws_access_key_id: "username" for Amazon web services.
        :type aws_availability_zone: str
        :param aws_availability_zone: availability zone to run the job in
        :type aws_secret_access_key: str
        :param aws_secret_access_key: your "password" on AWS
        :type aws_region: str
        :param aws_region: region to connect to S3 and EMR on (e.g.
                           ``us-west-1``). If you want to use separate regions
                           for S3 and EMR, set *emr_endpoint* and
                           *s3_endpoint*.
        :type bootstrap_actions: list of str
        :param bootstrap_actions: a list of raw bootstrap actions (essentially
                                  scripts) to run prior to any of the other
                                  bootstrap steps. Any arguments should be
                                  separated from the command by spaces (we use
                                  :py:func:`shlex.split`). If the action is on
                                  the local filesystem, we'll automatically
                                  upload it to S3.
        :type bootstrap_cmds: list
        :param bootstrap_cmds: a list of commands to run on the master node to
                               set up libraries, etc. Like *setup_cmds*, these
                               can be strings, which will be run in the shell,
                               or lists of args, which will be run directly.
                               Prepend ``sudo`` to commands to do things that
                               require root privileges.
        :type bootstrap_files: list of str
        :param bootstrap_files: files to download to the bootstrap working
                                directory on the master node before running
                                *bootstrap_cmds* (for example, Debian
                                packages). May be local files for mrjob to
                                upload to S3, or any URI that ``hadoop fs``
                                can handle.
        :type bootstrap_mrjob: boolean
        :param bootstrap_mrjob: This is actually an option in the base
                                :py:class:`~mrjob.job.MRJobRunner` class. If
                                this is ``True`` (the default), we'll tar up
                                :py:mod:`mrjob` from the local filesystem, and
                                install it on the master node.
        :type bootstrap_python_packages: list of str
        :param bootstrap_python_packages: paths of python modules to install
                                          on EMR. These should be standard
                                          Python module tarballs. If a module
                                          is named ``foo.tar.gz``, we expect to
                                          be able to run ``tar xfz foo.tar.gz;
                                          cd foo;
                                          sudo python setup.py install``.
        :type bootstrap_scripts: list of str
        :param bootstrap_scripts: scripts to upload and then run on the master
                                  node (a combination of *bootstrap_cmds* and
                                  *bootstrap_files*). These are run after the
                                  commands from *bootstrap_cmds*.
        :type check_emr_status_every: float
        :param check_emr_status_every: How often to check on the status of EMR
                                       jobs. Default is 30 seconds (too often
                                       and AWS will throttle you anyway).
        :type ec2_instance_type: str
        :param ec2_instance_type: What sort of EC2 instance(s) to use on the
                                  nodes that actually run tasks (see
                                  http://aws.amazon.com/ec2/instance-types/).
                                  When you run multiple instances (see
                                  *num_ec2_instances*), the master node is just
                                  coordinating the other nodes, so usually the
                                  default instance type (``m1.small``) is fine,
                                  and using larger instances is wasteful.
        :type ec2_key_pair: str
        :param ec2_key_pair: name of the SSH key you set up for EMR.
        :type ec2_key_pair_file: str
        :param ec2_key_pair_file: path to file containing the SSH key for EMR
        :type ec2_core_instance_type: str
        :param ec2_core_instance_type: like *ec2_instance_type*, but only
                                       for the core (also known as "slave")
                                       Hadoop nodes; these nodes run tasks and
                                       host HDFS. Usually you just want to use
                                       *ec2_instance_type*. Defaults to
                                       ``'m1.small'``.
        :type ec2_core_instance_bid_price: str
        :param ec2_core_instance_bid_price: when specified and not "0", this
                                            creates the core Hadoop nodes as
                                            spot instances at this bid price.
                                            You usually only want to set
                                            bid price for task instances.
        :type ec2_master_instance_type: str
        :param ec2_master_instance_type: like *ec2_instance_type*, but only
                                         for the master Hadoop node. This node
                                         hosts the task tracker and HDFS, and
                                         runs tasks if there are no other
                                         nodes. Usually you just want to use
                                         *ec2_instance_type*. Defaults to
                                         ``'m1.small'``.
        :type ec2_master_instance_bid_price: str
        :param ec2_master_instance_bid_price: when specified and not "0", this
                                              creates the master Hadoop node as
                                              a spot instance at this bid
                                              price. You usually only want to
                                              set bid price for task instances
                                              unless the master instance is
                                              your only instance.
        :type ec2_slave_instance_type: str
        :param ec2_slave_instance_type: An alias for *ec2_core_instance_type*,
                                        for consistency with the EMR API.
        :type ec2_task_instance_type: str
        :param ec2_task_instance_type: like *ec2_instance_type*, but only
                                       for the task Hadoop nodes; these nodes
                                       run tasks but do not host HDFS. Usually
                                       you just want to use
                                       *ec2_instance_type*. Defaults to
                                       the same instance type as
                                       *ec2_core_instance_type*.
        :type ec2_task_instance_bid_price: str
        :param ec2_task_instance_bid_price: when specified and not "0", this
                                            creates the task Hadoop nodes as
                                            spot instances at this bid price.
                                            (You usually only want to set
                                            bid price for task instances.)
        :type emr_endpoint: str
        :param emr_endpoint: optional host to connect to when communicating
                             with EMR (e.g.
                             ``us-west-1.elasticmapreduce.amazonaws.com``).
                             Default is to infer this from *aws_region*.
        :type emr_job_flow_id: str
        :param emr_job_flow_id: the ID of a persistent EMR job flow to run jobs
                                in (normally we launch our own job flow). It's
                                fine for other jobs to be using the job flow;
                                we give our job's steps a unique ID.
        :type emr_job_flow_pool_name: str
        :param emr_job_flow_pool_name: Specify a pool name to join. Defaults
                                       to ``'default'``. Does not imply
                                       ``pool_emr_job_flows``.
        :type enable_emr_debugging: str
        :param enable_emr_debugging: store Hadoop logs in SimpleDB
        :type hadoop_streaming_jar: str
        :param hadoop_streaming_jar: This is actually an option in the base
                                     :py:class:`~mrjob.runner.MRJobRunner`
                                     class. Points to a custom hadoop streaming
                                     jar on the local filesystem or S3. If you
                                     want to point to a streaming jar already
                                     installed on the EMR instances (perhaps
                                     through a bootstrap action?), use
                                     *hadoop_streaming_jar_on_emr*.
        :type hadoop_streaming_jar_on_emr: str
        :param hadoop_streaming_jar_on_emr: Like *hadoop_streaming_jar*, except
                                            that it points to a path on the EMR
                                            instance, rather than to a local
                                            file or one on S3. Rarely necessary
                                            to set this by hand.
        :type hadoop_version: str
        :param hadoop_version: Set the version of Hadoop to use on EMR.
                               Consider setting *ami_version* instead; only AMI
                               version 1.0 supports multiple versions of Hadoop
                               anyway. If *ami_version* is not set, we'll
                               default to Hadoop 0.20 for backwards
                               compatibility with :py:mod:`mrjob` v0.3.0.
        :type num_ec2_core_instances: int
        :param num_ec2_core_instances: Number of core (or "slave") instances to
                                       start up. These run your job and host
                                       HDFS. Incompatible with
                                       *num_ec2_instances*. This is in addition
                                       to the single master instance.
        :type num_ec2_instances: int
        :param num_ec2_instances: Total number of instances to start up;
                                  basically the number of core instances you
                                  want, plus 1 (there is always one master
                                  instance). Default is ``1``. Incompatible
                                  with *num_ec2_core_instances* and
                                  *num_ec2_task_instances*.
        :type num_ec2_task_instances: int
        :param num_ec2_task_instances: number of task instances to start up.
                                       These run your job but do not host
                                       HDFS. Incompatible with
                                       *num_ec2_instances*. If you use this,
                                       you must set *num_ec2_core_instances*;
                                       EMR does not allow you to run task
                                       instances without core instances (because
                                       there's nowhere to host HDFS).
        :type pool_emr_job_flows: bool
        :param pool_emr_job_flows: Try to run the job on a ``WAITING`` pooled
                                   job flow with the same bootstrap
                                   configuration. Prefer the one with the most
                                   compute units. Use S3 to "lock" the job flow
                                   and ensure that the job is not scheduled
                                   behind another job. If no suitable job flow
                                   is `WAITING`, create a new pooled job flow.
                                   **WARNING**: do not run this without having\
        :py:mod:`mrjob.tools.emr.terminate_idle_job_flows`
                                   in your crontab; job flows left idle can
                                   quickly become expensive!
        :type s3_endpoint: str
        :param s3_endpoint: Host to connect to when communicating with S3 (e.g.
                            ``s3-us-west-1.amazonaws.com``). Default is to
                            infer this from *aws_region*.
        :type s3_log_uri: str
        :param s3_log_uri: where on S3 to put logs, for example
                           ``s3://yourbucket/logs/``. Logs for your job flow
                           will go into a subdirectory, e.g.
                           ``s3://yourbucket/logs/j-JOBFLOWID/``. Default
                           is to append ``logs/`` to *s3_scratch_uri*.
        :type s3_scratch_uri: str
        :param s3_scratch_uri: S3 directory (URI ending in ``/``) to use as
                               scratch space, e.g. ``s3://yourbucket/tmp/``.
                               Default is ``tmp/mrjob/`` in the first bucket
                               belonging to you.
        :type s3_sync_wait_time: float
        :param s3_sync_wait_time: How long to wait for S3 to reach eventual
                                  consistency. This is typically less than a
                                  second (zero in U.S. West) but the default is
                                  5.0 to be safe.
        :type ssh_bin: str or list
        :param ssh_bin: path to the ssh binary; may include switches (e.g.
                        ``'ssh -v'`` or ``['ssh', '-v']``). Defaults to
                        :command:`ssh`
        :type ssh_bind_ports: list of int
        :param ssh_bind_ports: a list of ports that are safe to listen on.
                               Defaults to ports ``40001`` thru ``40840``.
        :type ssh_tunnel_to_job_tracker: bool
        :param ssh_tunnel_to_job_tracker: If True, create an ssh tunnel to the
                                          job tracker and listen on a randomly
                                          chosen port. This requires you to set
                                          *ec2_key_pair* and
                                          *ec2_key_pair_file*. See
                                          :ref:`ssh-tunneling` for detailed
                                          instructions.
        :type ssh_tunnel_is_open: bool
        :param ssh_tunnel_is_open: if True, any host can connect to the job
                                   tracker through the SSH tunnel you open.
                                   Mostly useful if your browser is running on
                                   a different machine from your job runner.
        """
        super(EMRJobRunner, self).__init__(**kwargs)

        # make aws_region an instance variable; we might want to set it
        # based on the scratch bucket
        self._aws_region = self._opts['aws_region'] or ''

        # if we're going to create a bucket to use as temp space, we don't
        # want to actually create it until we run the job (Issue #50).
        # This variable helps us create the bucket as needed
        self._s3_temp_bucket_to_create = None

        self._fix_s3_scratch_and_log_uri_opts()

        self._fix_ec2_instance_opts()

        # pick a tmp dir based on the job name
        self._s3_tmp_uri = self._opts['s3_scratch_uri'] + self._job_name + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = self._check_and_fix_s3_dir(self._output_dir)
        else:
            self._output_dir = self._s3_tmp_uri + 'output/'

        # add the bootstrap files to a list of files to upload
        self._bootstrap_actions = []
        for action in self._opts['bootstrap_actions']:
            args = shlex.split(action)
            if not args:
                raise ValueError('bad bootstrap action: %r' % (action,))
            # don't use _add_bootstrap_file() because this is a raw bootstrap
            # action, not part of mrjob's bootstrap utilities
            file_dict = self._add_file(args[0])
            file_dict['args'] = args[1:]
            self._bootstrap_actions.append(file_dict)

        for path in self._opts['bootstrap_files']:
            self._add_bootstrap_file(path)

        self._bootstrap_scripts = []
        for path in self._opts['bootstrap_scripts']:
            file_dict = self._add_bootstrap_file(path)
            self._bootstrap_scripts.append(file_dict)

        self._bootstrap_python_packages = []
        for path in self._opts['bootstrap_python_packages']:
            name, path = self._split_path(path)
            if not path.endswith('.tar.gz'):
                raise ValueError(
                    'bootstrap_python_packages only accepts .tar.gz files!')
            file_dict = self._add_bootstrap_file(path)
            self._bootstrap_python_packages.append(file_dict)

        self._streaming_jar = None
        if self._opts.get('hadoop_streaming_jar'):
            self._streaming_jar = self._add_file_for_upload(
                self._opts['hadoop_streaming_jar'])

        if not (isinstance(self._opts['additional_emr_info'], basestring)
                or self._opts['additional_emr_info'] is None):
            self._opts['additional_emr_info'] = json.dumps(
                self._opts['additional_emr_info'])

        # if we're bootstrapping mrjob, keep track of the file_dict
        # for mrjob.tar.gz
        self._mrjob_tar_gz_file = None

        # where our own logs ended up (we'll find this out once we run the job)
        self._s3_job_log_uri = None

        # where to get input from. We'll fill this later. Once filled,
        # this must be a list (not some other sort of container)
        self._s3_input_uris = None

        # we'll create the script later
        self._master_bootstrap_script = None

        # the ID assigned by EMR to this job (might be None)
        self._emr_job_flow_id = self._opts['emr_job_flow_id']

        # when did our particular task start?
        self._emr_job_start = None

        # ssh state
        self._ssh_proc = None
        self._gave_cant_ssh_warning = False
        self._ssh_key_name = None

        # cache for SSH address
        self._address = None
        self._ssh_slave_addrs = None

        # store the tracker URL for completion status
        self._tracker_url = None

        # turn off tracker progress until tunnel is up
        self._show_tracker_progress = False

        # default requested hadoop version if AMI version is not set
        if not (self._opts['ami_version'] or self._opts['hadoop_version']):
            self._opts['hadoop_version'] = '0.20'

        # init hadoop version cache
        self._inferred_hadoop_version = None

    @classmethod
    def _allowed_opts(cls):
        """A list of which keyword args we can pass to __init__()"""
        return super(EMRJobRunner, cls)._allowed_opts() + [
            'additional_emr_info',
            'ami_version',
            'aws_access_key_id',
            'aws_availability_zone',
            'aws_region',
            'aws_secret_access_key',
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_python_packages',
            'bootstrap_scripts',
            'check_emr_status_every',
            'ec2_core_instance_bid_price',
            'ec2_core_instance_type',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_bid_price',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'ec2_task_instance_bid_price',
            'ec2_task_instance_type',
            'emr_endpoint',
            'emr_job_flow_id',
            'emr_job_flow_pool_name',
            'enable_emr_debugging',
            'hadoop_streaming_jar_on_emr',
            'hadoop_version',
            'num_ec2_core_instances',
            'num_ec2_instances',
            'num_ec2_task_instances',
            'pool_emr_job_flows',
            's3_endpoint',
            's3_log_uri',
            's3_scratch_uri',
            's3_sync_wait_time',
            'ssh_bin',
            'ssh_bind_ports',
            'ssh_tunnel_is_open',
            'ssh_tunnel_to_job_tracker',
        ]

    @classmethod
    def _default_opts(cls):
        """A dictionary giving the default value of options."""
        return combine_dicts(super(EMRJobRunner, cls)._default_opts(), {
            'check_emr_status_every': 30,
            'ec2_core_instance_type': 'm1.small',
            'ec2_master_instance_type': 'm1.small',
            'emr_job_flow_pool_name': 'default',
            'hadoop_version': None,  # defaulted in __init__()
            'hadoop_streaming_jar_on_emr':
                '/home/hadoop/contrib/streaming/hadoop-streaming.jar',
            'num_ec2_core_instances': 0,
            'num_ec2_instances': 1,
            'num_ec2_task_instances': 0,
            's3_sync_wait_time': 5.0,
            'ssh_bin': ['ssh'],
            'ssh_bind_ports': range(40001, 40841),
            'ssh_tunnel_to_job_tracker': False,
            'ssh_tunnel_is_open': False,
        })

    @classmethod
    def _opts_combiners(cls):
        """Map from option name to a combine_*() function used to combine
        values for that option. This allows us to specify that some options
        are lists, or contain environment variables, or whatever."""
        return combine_dicts(super(EMRJobRunner, cls)._opts_combiners(), {
            'bootstrap_actions': combine_lists,
            'bootstrap_cmds': combine_lists,
            'bootstrap_files': combine_path_lists,
            'bootstrap_python_packages': combine_path_lists,
            'bootstrap_scripts': combine_path_lists,
            'ec2_key_pair_file': combine_paths,
            's3_log_uri': combine_paths,
            's3_scratch_uri': combine_paths,
            'ssh_bin': combine_cmds,
        })
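        # Roughly (see mrjob.conf for the exact semantics): options combined
        # with combine_lists/combine_path_lists are concatenated across config
        # sources, while combine_paths/combine_cmds options keep the value
        # from the highest-priority source, with paths getting ~ and
        # environment-variable expansion.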

    def _fix_ec2_instance_opts(self):
        """If the *ec2_instance_type* option is set, override instance
        type for the nodes that actually run tasks (see Issue #66). Allow
        command-line arguments to override defaults and arguments
        in mrjob.conf (see Issue #311).

        Also, make sure that the core and slave instance types are the same,
        that the total number of instances matches the number of master, core,
        and task instances, and that bid prices of zero are converted to None.

        Helper for __init__.
        """
        # Make sure slave and core instance type have the same value
        # Within EMRJobRunner we only ever use ec2_core_instance_type,
        # but we want ec2_slave_instance_type to be correct in the
        # options dictionary.
        if (self._opts['ec2_slave_instance_type'] and
            (self._opt_priority['ec2_slave_instance_type'] >
             self._opt_priority['ec2_core_instance_type'])):
            self._opts['ec2_core_instance_type'] = (
                self._opts['ec2_slave_instance_type'])
        else:
            self._opts['ec2_slave_instance_type'] = (
                self._opts['ec2_core_instance_type'])

        # If task instance type is not set, use core instance type
        # (This is mostly so that we don't inadvertently join a pool
        # with task instance types with too little memory.)
        if not self._opts['ec2_task_instance_type']:
            self._opts['ec2_task_instance_type'] = (
                self._opts['ec2_core_instance_type'])

        # Within EMRJobRunner, we use num_ec2_core_instances and
        # num_ec2_task_instances, not num_ec2_instances. (Number
        # of master instances is always 1.)
        if (self._opt_priority['num_ec2_instances'] >
            max(self._opt_priority['num_ec2_core_instances'],
                self._opt_priority['num_ec2_task_instances'])):
            # assume 1 master, n - 1 core, 0 task
            self._opts['num_ec2_core_instances'] = (
                self._opts['num_ec2_instances'] - 1)
            self._opts['num_ec2_task_instances'] = 0
        else:
            # issue a warning if we used both kinds of instance number
            # options on the command line or in mrjob.conf
            if (self._opt_priority['num_ec2_instances'] >= 2 and
                self._opt_priority['num_ec2_instances'] <=
                max(self._opt_priority['num_ec2_core_instances'],
                    self._opt_priority['num_ec2_task_instances'])):
                log.warn('Mixing num_ec2_instances and'
                         ' num_ec2_{core,task}_instances does not make sense;'
                         ' ignoring num_ec2_instances')
            # recalculate number of EC2 instances
            self._opts['num_ec2_instances'] = (
                1 +
                self._opts['num_ec2_core_instances'] +
                self._opts['num_ec2_task_instances'])

        # Allow ec2 instance type to override other instance types
        ec2_instance_type = self._opts['ec2_instance_type']
        if ec2_instance_type:
            # core (slave) instances
            if (self._opt_priority['ec2_instance_type'] >
                max(self._opt_priority['ec2_core_instance_type'],
                    self._opt_priority['ec2_slave_instance_type'])):
                self._opts['ec2_core_instance_type'] = ec2_instance_type
                self._opts['ec2_slave_instance_type'] = ec2_instance_type

            # master instance only does work when it's the only instance
            if (self._opts['num_ec2_core_instances'] <= 0 and
                self._opts['num_ec2_task_instances'] <= 0 and
                (self._opt_priority['ec2_instance_type'] >
                 self._opt_priority['ec2_master_instance_type'])):
                self._opts['ec2_master_instance_type'] = ec2_instance_type

            # task instances
            if (self._opt_priority['ec2_instance_type'] >
                self._opt_priority['ec2_task_instance_type']):
                self._opts['ec2_task_instance_type'] = ec2_instance_type

        # convert a bid price of '0' to None
        for role in ('core', 'master', 'task'):
            opt_name = 'ec2_%s_instance_bid_price' % role
            if not self._opts[opt_name]:
                self._opts[opt_name] = None
            else:
                # convert "0", "0.00" etc. to None
                try:
                    value = float(self._opts[opt_name])
                    if value == 0:
                        self._opts[opt_name] = None
                except ValueError:
                    pass  # maybe EMR will accept non-floats?
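        # Net effect of the loop above (illustrative): bid prices of '0',
        # '0.00', or '' become None, while a nonzero string like '0.25' is
        # passed through unchanged for EMR to interpret.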

    def _fix_s3_scratch_and_log_uri_opts(self):
        """Fill in s3_scratch_uri and s3_log_uri (in self._opts) if they
        aren't already set.

        Helper for __init__.
        """
        s3_conn = self.make_s3_conn()
        # check s3_scratch_uri against aws_region if specified
        if self._opts['s3_scratch_uri']:
            bucket_name, _ = parse_s3_uri(self._opts['s3_scratch_uri'])
            bucket_loc = s3_conn.get_bucket(bucket_name).get_location()

            # make sure they can communicate if both specified
            if (self._aws_region and bucket_loc and
                self._aws_region != bucket_loc):
                log.warning('warning: aws_region (%s) does not match bucket'
                            ' region (%s). Your EC2 instances may not be able'
                            ' to reach your S3 buckets.' %
                            (self._aws_region, bucket_loc))

            # otherwise derive aws_region from bucket_loc
            elif bucket_loc and not self._aws_region:
                log.info(
                    "inferring aws_region from scratch bucket's region (%s)" %
                    bucket_loc)
                self._aws_region = bucket_loc
        # set s3_scratch_uri by checking for existing buckets
        else:
            self._set_s3_scratch_uri(s3_conn)
            log.info('using %s as our scratch dir on S3' %
                     self._opts['s3_scratch_uri'])

        self._opts['s3_scratch_uri'] = self._check_and_fix_s3_dir(
            self._opts['s3_scratch_uri'])

        # set s3_log_uri
        if self._opts['s3_log_uri']:
            self._opts['s3_log_uri'] = self._check_and_fix_s3_dir(
                self._opts['s3_log_uri'])
        else:
            self._opts['s3_log_uri'] = self._opts['s3_scratch_uri'] + 'logs/'

    def _set_s3_scratch_uri(self, s3_conn):
        """Helper for _fix_s3_scratch_and_log_uri_opts"""
        buckets = s3_conn.get_all_buckets()
        mrjob_buckets = [b for b in buckets if b.name.startswith('mrjob-')]

        # Loop over buckets until we find one that is not region-
        #   restricted, matches aws_region, or can be used to
        #   infer aws_region if no aws_region is specified
        for scratch_bucket in mrjob_buckets:
            scratch_bucket_name = scratch_bucket.name
            scratch_bucket_location = scratch_bucket.get_location()

            if scratch_bucket_location:
                if scratch_bucket_location == self._aws_region:
                    # Regions are both specified and match
                    log.info("using existing scratch bucket %s" %
                             scratch_bucket_name)
                    self._opts['s3_scratch_uri'] = (
                        's3://%s/tmp/' % scratch_bucket_name)
                    return
                elif not self._aws_region:
                    # aws_region not specified, so set it based on this
                    #   bucket's location and use this bucket
                    self._aws_region = scratch_bucket_location
                    log.info("inferring aws_region from scratch bucket's"
                             " region (%s)" % self._aws_region)
                    self._opts['s3_scratch_uri'] = (
                        's3://%s/tmp/' % scratch_bucket_name)
                    return
                elif scratch_bucket_location != self._aws_region:
                    continue
            elif not self._aws_region:
                # Only use regionless buckets if the job flow is regionless
                log.info("using existing scratch bucket %s" %
                         scratch_bucket_name)
                self._opts['s3_scratch_uri'] = (
                    's3://%s/tmp/' % scratch_bucket_name)
                return

        # No existing bucket matched, so pick a name for a new one.
        scratch_bucket_name = 'mrjob-%016x' % random.randint(0, 2 ** 64 - 1)
        self._s3_temp_bucket_to_create = scratch_bucket_name
        log.info("creating new scratch bucket %s" % scratch_bucket_name)
        self._opts['s3_scratch_uri'] = 's3://%s/tmp/' % scratch_bucket_name

    def _set_s3_job_log_uri(self, job_flow):
        """Given a job flow description, set self._s3_job_log_uri. This allows
        us to call self.ls(), etc. without running the job.
        """
        log_uri = getattr(job_flow, 'loguri', '')
        if log_uri:
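            # EMR reports the log URI with an s3n:// scheme; normalize it to
            # s3:// and append our job flow ID to get our job's log directory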
            self._s3_job_log_uri = '%s%s/' % (
                log_uri.replace('s3n://', 's3://'), self._emr_job_flow_id)

    def _create_s3_temp_bucket_if_needed(self):
        """Make sure temp bucket exists"""
        if self._s3_temp_bucket_to_create:
            s3_conn = self.make_s3_conn()
            log.info('creating S3 bucket %r to use as scratch space' %
                     self._s3_temp_bucket_to_create)
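            # S3 location constraints don't always match region names, so
            # translate where we know the mapping (falling back to the
            # region name itself)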
            location = REGION_TO_S3_LOCATION_CONSTRAINT.get(
                self._aws_region, self._aws_region)
            s3_conn.create_bucket(self._s3_temp_bucket_to_create,
                                  location=(location or ''))
            self._s3_temp_bucket_to_create = None

    def _check_and_fix_s3_dir(self, s3_uri):
        """Helper for __init__"""
        if not is_s3_uri(s3_uri):
            raise ValueError('Invalid S3 URI: %r' % s3_uri)
        if not s3_uri.endswith('/'):
            s3_uri = s3_uri + '/'

        return s3_uri

    def _run(self):
        self._prepare_for_launch()

        self._launch_emr_job()
        self._wait_for_job_to_complete()

    def _prepare_for_launch(self):
        self._setup_input()
        self._create_wrapper_script()
        self._create_master_bootstrap_script()
        self._upload_non_input_files()

    def _setup_input(self):
        """Copy local input files (if any) to a special directory on S3.

        Set self._s3_input_uris
        Helper for _run
        """
        self._create_s3_temp_bucket_if_needed()
        # winnow out s3 files from local ones
        self._s3_input_uris = []
        local_input_paths = []
        for path in self._input_paths:
            if is_s3_uri(path):
                # Don't even bother running the job if the input isn't there,
                # since it's costly to spin up instances.
                if not self.path_exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))
                self._s3_input_uris.append(path)
            else:
                local_input_paths.append(path)

        # copy local files into an input directory, with names like
        # 00000-actual_name.ext
        if local_input_paths:
            s3_input_dir = self._s3_tmp_uri + 'input/'
            log.info('Uploading input to %s' % s3_input_dir)

            s3_conn = self.make_s3_conn()
            for file_num, path in enumerate(local_input_paths):
                if path == '-':
                    path = self._dump_stdin_to_local_file()

                target = '%s%05d-%s' % (
                    s3_input_dir, file_num, os.path.basename(path))
                log.debug('uploading %s -> %s' % (path, target))
                s3_key = self.make_s3_key(target, s3_conn)
                s3_key.set_contents_from_filename(path)

            self._s3_input_uris.append(s3_input_dir)

    def _add_bootstrap_file(self, path):
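        # track this file so it gets uploaded to S3 and then downloaded onto
        # the master node by the master bootstrap script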
        name, path = self._split_path(path)
        file_dict = {'path': path, 'name': name, 'bootstrap': 'file'}
        self._files.append(file_dict)
        return file_dict

    def _pick_s3_uris_for_files(self):
        """Decide where each file will be uploaded on S3.

        Okay to call this multiple times.
        """
        self._assign_unique_names_to_files(
            's3_uri', prefix=self._s3_tmp_uri + 'files/',
            match=is_s3_uri)

    def _upload_non_input_files(self):
        """Copy files to S3

        Pick S3 URIs for them if we haven't already."""
        self._create_s3_temp_bucket_if_needed()
        self._pick_s3_uris_for_files()

        s3_files_dir = self._s3_tmp_uri + 'files/'
        log.info('Copying non-input files into %s' % s3_files_dir)

        s3_conn = self.make_s3_conn()
        for file_dict in self._files:
            path = file_dict['path']

            # don't bother with files that are already on s3
            if is_s3_uri(path):
                continue

            s3_uri = file_dict['s3_uri']

            log.debug('uploading %s -> %s' % (path, s3_uri))
            s3_key = self.make_s3_key(s3_uri, s3_conn)
            s3_key.set_contents_from_filename(file_dict['path'])

    def setup_ssh_tunnel_to_job_tracker(self, host):
        """setup the ssh tunnel to the job tracker, if it's not currently
        running.

        Args:
        host -- hostname of the EMR master node.
        """
        REQUIRED_OPTS = ['ec2_key_pair', 'ec2_key_pair_file', 'ssh_bind_ports']
        for opt_name in REQUIRED_OPTS:
            if not self._opts[opt_name]:
                if not self._gave_cant_ssh_warning:
                    log.warning(
                        "You must set %s in order to ssh to the job tracker!" %
                        opt_name)
                    self._gave_cant_ssh_warning = True
                return

        # if there was already a tunnel, make sure it's still up
        if self._ssh_proc:
            self._ssh_proc.poll()
            if self._ssh_proc.returncode is None:
                return
            else:
                log.warning('Oops, ssh subprocess exited with return code %d,'
                            ' restarting...' % self._ssh_proc.returncode)
                self._ssh_proc = None

        log.info('Opening ssh tunnel to Hadoop job tracker')

        # if ssh detects that a host key has changed, it will silently not
        # open the tunnel, so make a fake empty known_hosts file and use that.
        # (you can actually use /dev/null as your known hosts file, but
        # that's UNIX-specific)
        fake_known_hosts_file = os.path.join(
            self._get_local_tmp_dir(), 'fake_ssh_known_hosts')
        # blank out the file, if it exists
        f = open(fake_known_hosts_file, 'w')
        f.close()
        log.debug('Created empty ssh known-hosts file: %s' % (
            fake_known_hosts_file,))

        bind_port = None
        for bind_port in self._pick_ssh_bind_ports():
            args = self._opts['ssh_bin'] + [
                '-o', 'VerifyHostKeyDNS=no',
                '-o', 'StrictHostKeyChecking=no',
                '-o', 'ExitOnForwardFailure=yes',
                '-o', 'UserKnownHostsFile=%s' % fake_known_hosts_file,
                '-L', '%d:localhost:%d' % (bind_port, EMR_JOB_TRACKER_PORT),
                '-N', '-q',  # no shell, no output
                '-i', self._opts['ec2_key_pair_file'],
            ]
            if self._opts['ssh_tunnel_is_open']:
                args.extend(['-g', '-4'])  # -4: listen on IPv4 only
            args.append('hadoop@' + host)
            log.debug('> %s' % cmd_line(args))

            ssh_proc = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
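            # give ssh a moment to fail (e.g. if the bind port is taken)
            # before checking whether the tunnel is still up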
            time.sleep(WAIT_FOR_SSH_TO_FAIL)
            ssh_proc.poll()
            # still running. We are golden
            if ssh_proc.returncode is None:
                self._ssh_proc = ssh_proc
                break

        if not self._ssh_proc:
            log.warning('Failed to open ssh tunnel to job tracker')
        else:
            if self._opts['ssh_tunnel_is_open']:
                bind_host = socket.getfqdn()
            else:
                bind_host = 'localhost'
            self._tracker_url = 'http://%s:%d%s' % (
                bind_host, bind_port, EMR_JOB_TRACKER_PATH)
            self._show_tracker_progress = True
            log.info('Connect to job tracker at: %s' % self._tracker_url)

    def _pick_ssh_bind_ports(self):
        """Pick a list of ports to try binding our SSH tunnel to.

        We will try to bind the same port for any given job flow (Issue #67)
        """
        # don't perturb the random number generator
        random_state = random.getstate()
        try:
            # seed random port selection on job flow ID
            random.seed(self._emr_job_flow_id)
            num_picks = min(MAX_SSH_RETRIES, len(self._opts['ssh_bind_ports']))
            return random.sample(self._opts['ssh_bind_ports'], num_picks)
        finally:
            random.setstate(random_state)

    def _enable_slave_ssh_access(self):
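        # copy our key pair file to the master node (as <job name>.pem) so
        # that we can hop from the master to the slaves over SSH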
        if not self._ssh_key_name:
            self._ssh_key_name = self._job_name + '.pem'
            ssh_copy_key(
                self._opts['ssh_bin'],
                self._address_of_master(),
                self._opts['ec2_key_pair_file'],
                self._ssh_key_name)

    ### Running the job ###

    def cleanup(self, mode=None):
        super(EMRJobRunner, self).cleanup(mode=mode)

        # always stop our SSH tunnel if it's still running
        if self._ssh_proc:
            self._ssh_proc.poll()
            if self._ssh_proc.returncode is None:
                log.info('Killing our SSH tunnel (pid %d)' %
                         self._ssh_proc.pid)
                try:
                    os.kill(self._ssh_proc.pid, signal.SIGKILL)
                    self._ssh_proc = None
                except Exception, e:
                    log.exception(e)

        # stop the job flow if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        # don't stop it if it was created due to --pool because the user
        # probably wants to use it again
        if self._emr_job_flow_id and not self._opts['emr_job_flow_id'] \
           and not self._opts['pool_emr_job_flows']:
            log.info('Terminating job flow: %s' % self._emr_job_flow_id)
            try:
                self.make_emr_conn().terminate_jobflow(self._emr_job_flow_id)
            except Exception, e:
                log.exception(e)

    def _cleanup_remote_scratch(self):
        # delete all the files we created
        if self._s3_tmp_uri:
            try:
                log.info('Removing all files in %s' % self._s3_tmp_uri)
                self.rm(self._s3_tmp_uri)
                self._s3_tmp_uri = None
            except Exception, e:
                log.exception(e)

    def _cleanup_logs(self):
        super(EMRJobRunner, self)._cleanup_logs()

        # delete the log files, if it's a job flow we created (the logs
        # belong to the job flow)
        if self._s3_job_log_uri and not self._opts['emr_job_flow_id'] \
           and not self._opts['pool_emr_job_flows']:
            try:
                log.info('Removing all files in %s' % self._s3_job_log_uri)
                self.rm(self._s3_job_log_uri)
                self._s3_job_log_uri = None
            except Exception, e:
                log.exception(e)

    def _wait_for_s3_eventual_consistency(self):
        """Sleep for a little while, to give S3 a chance to sync up.
        """
        log.info('Waiting %.1fs for S3 eventual consistency' %
                 self._opts['s3_sync_wait_time'])
        time.sleep(self._opts['s3_sync_wait_time'])

    def _wait_for_job_flow_termination(self):
        try:
            jobflow = self._describe_jobflow()
        except boto.exception.S3ResponseError:
            # mockboto throws this for some reason
            return
        if (jobflow.keepjobflowalivewhennosteps == 'true' and
            jobflow.state == 'WAITING'):
            raise Exception('Operation requires job flow to terminate, but'
                            ' it may never do so.')
        while jobflow.state not in ('TERMINATED', 'COMPLETED', 'FAILED',
                                    'SHUTTING_DOWN'):
            msg = ('Waiting for job flow to terminate (currently %s)' %
                   jobflow.state)
            log.info(msg)
            time.sleep(self._opts['check_emr_status_every'])
            jobflow = self._describe_jobflow()

    def _create_instance_group(self, role, instance_type, count, bid_price):
        """Helper method for creating instance groups. For use when
        creating a jobflow using a list of InstanceGroups, instead
        of the typical triumverate of
        num_instances/master_instance_type/slave_instance_type.

            - Role is either 'master', 'core', or 'task'.
            - instance_type is an EC2 instance type
            - count is an int
            - bid_price is a number, a string, or None. If None,
              this instance group will be use the ON-DEMAND market
              instead of the SPOT market.
        """

        if not instance_type:
            if self._opts['ec2_instance_type']:
                instance_type = self._opts['ec2_instance_type']
            else:
                raise ValueError('Missing instance type for %s node(s)'
                    % role)

        if bid_price:
            market = 'SPOT'
            bid_price = str(bid_price)  # must be a string
        else:
            market = 'ON_DEMAND'
            bid_price = None

        # Just name the groups "master", "task", and "core"
        name = role.lower()

        return boto_2_1_1_83aae37b.InstanceGroup(
            count, role, instance_type, market, name, bidprice=bid_price
            )

    def _create_job_flow(self, persistent=False, steps=None):
        """Create an empty job flow on EMR, and return the ID of that
        job.

        persistent -- if this is true, create the job flow with the --alive
            option, indicating the job will have to be manually terminated.
        """
        # make sure we can see the files we copied to S3
        self._wait_for_s3_eventual_consistency()

        # figure out local names and S3 URIs for our bootstrap actions, if any
        self._name_files()
        self._pick_s3_uris_for_files()

        log.info('Creating Elastic MapReduce job flow')
        args = self._job_flow_args(persistent, steps)

        emr_conn = self.make_emr_conn()
        log.debug('Calling run_jobflow(%r, %r, %s)' % (
            self._job_name, self._opts['s3_log_uri'],
            ', '.join('%s=%r' % (k, v) for k, v in args.iteritems())))
        emr_job_flow_id = emr_conn.run_jobflow(
            self._job_name, self._opts['s3_log_uri'], **args)

        # keep track of when we started our job
        self._emr_job_start = time.time()

        log.info('Job flow created with ID: %s' % emr_job_flow_id)
        return emr_job_flow_id

    def _job_flow_args(self, persistent=False, steps=None):
        """Build kwargs for emr_conn.run_jobflow()"""
        args = {}

        args['ami_version'] = self._opts['ami_version']
        args['hadoop_version'] = self._opts['hadoop_version']

        if self._opts['aws_availability_zone']:
            args['availability_zone'] = self._opts['aws_availability_zone']

        # The old, simple API, available if we're not using task instances
        # or bid prices
        if not (self._opts['num_ec2_task_instances'] or
                self._opts['ec2_core_instance_bid_price'] or
                self._opts['ec2_master_instance_bid_price'] or
                self._opts['ec2_task_instance_bid_price']):
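            # num_instances counts the master, so add 1 to the core count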
            args['num_instances'] = self._opts['num_ec2_core_instances'] + 1
            args['master_instance_type'] = (
                self._opts['ec2_master_instance_type'])
            args['slave_instance_type'] = self._opts['ec2_core_instance_type']
        else:
            # Create a list of InstanceGroups
            args['instance_groups'] = [
                self._create_instance_group(
                    'MASTER',
                    self._opts['ec2_master_instance_type'],
                    1,
                    self._opts['ec2_master_instance_bid_price']
                    ),
            ]

            if self._opts['num_ec2_core_instances']:
                args['instance_groups'].append(
                    self._create_instance_group(
                        'CORE',
                        self._opts['ec2_core_instance_type'],
                        self._opts['num_ec2_core_instances'],
                        self._opts['ec2_core_instance_bid_price']
                    )
                )

            if self._opts['num_ec2_task_instances']:
                args['instance_groups'].append(
                    self._create_instance_group(
                        'TASK',
                        self._opts['ec2_task_instance_type'],
                        self._opts['num_ec2_task_instances'],
                        self._opts['ec2_task_instance_bid_price']
                        )
                    )

        # bootstrap actions
        bootstrap_action_args = []

        for file_dict in self._bootstrap_actions:
            # file_dict is not populated the same way by tools and real job
            # runs, so use s3_uri or path as appropriate
            s3_uri = file_dict.get('s3_uri', None) or file_dict['path']
            bootstrap_action_args.append(
                boto.emr.BootstrapAction(
                file_dict['name'], s3_uri, file_dict['args']))

        if self._master_bootstrap_script:
            master_bootstrap_script_args = []
            if self._opts['pool_emr_job_flows']:
                master_bootstrap_script_args = [
                    'pool-' + self._pool_hash(),
                    self._opts['emr_job_flow_pool_name'],
                ]
            bootstrap_action_args.append(
                boto.emr.BootstrapAction(
                    'master', self._master_bootstrap_script['s3_uri'],
                    master_bootstrap_script_args))

        if bootstrap_action_args:
            args['bootstrap_actions'] = bootstrap_action_args

        if self._opts['ec2_key_pair']:
            args['ec2_keyname'] = self._opts['ec2_key_pair']

        if self._opts['enable_emr_debugging']:
            args['enable_debugging'] = True

        if self._opts['additional_emr_info']:
            args['additional_info'] = self._opts['additional_emr_info']

        if persistent or self._opts['pool_emr_job_flows']:
            args['keep_alive'] = True

        if steps:
            args['steps'] = steps

        return args

    def _build_steps(self):
        """Return a list of boto Step objects corresponding to the
        steps we want to run."""
        assert self._script  # can't build steps if no script!

        # figure out local names for our files
        self._name_files()
        self._pick_s3_uris_for_files()

        # we're going to instruct EMR to upload the MR script and the
        # wrapper script (if any) to the job's local directory
        self._script['upload'] = 'file'
        if self._wrapper_script:
            self._wrapper_script['upload'] = 'file'

        # quick, add the other steps before the job spins up and
        # then shuts itself down (in practice this takes several minutes)
        steps = self._get_steps()
        step_list = []

        version = self.get_hadoop_version()

        for step_num, step in enumerate(steps):
            # EMR-specific stuff
            name = '%s: Step %d of %d' % (
                self._job_name, step_num + 1, len(steps))

            # don't terminate other people's job flows
            if (self._opts['emr_job_flow_id'] or
                self._opts['pool_emr_job_flows']):
                action_on_failure = 'CANCEL_AND_WAIT'
            else:
                action_on_failure = 'TERMINATE_JOB_FLOW'

            # Hadoop streaming stuff
            if 'M' not in step:  # if we have an identity mapper
                mapper = 'cat'
            else:
                mapper = cmd_line(self._mapper_args(step_num))

            if 'C' in step:
                combiner = cmd_line(self._combiner_args(step_num))
            else:
                combiner = None

            if 'R' in step:  # i.e. if there is a reducer:
                reducer = cmd_line(self._reducer_args(step_num))
            else:
                reducer = None

            input = self._s3_step_input_uris(step_num)
            output = self._s3_step_output_uri(step_num)

            step_args, cache_files, cache_archives = self._cache_args()

            step_args.extend(self._hadoop_conf_args(step_num, len(steps)))
            jar = self._get_jar()

            if combiner is not None:
                if compat.supports_combiners_in_hadoop_streaming(version):
                    step_args.extend(['-combiner', combiner])
                else:
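                    # this version of Hadoop streaming can't run a command as
                    # a combiner, so emulate one by piping the mapper's output
                    # through sort and the combiner command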
                    mapper = "bash -c '%s | sort | %s'" % (mapper, combiner)

            streaming_step = boto.emr.StreamingStep(
                name=name, mapper=mapper, reducer=reducer,
                action_on_failure=action_on_failure,
                cache_files=cache_files, cache_archives=cache_archives,
                step_args=step_args, input=input, output=output,
                jar=jar)

            step_list.append(streaming_step)

        return step_list

    def _cache_args(self):
        """Returns ``(step_args, cache_files, cache_archives)``, populating
        each according to the correct behavior for the current Hadoop version.

        For < 0.20, populate cache_files and cache_archives.
        For >= 0.20, populate step_args.

        step_args should be inserted into the step arguments before anything
            else.

        cache_files and cache_archives should be passed as arguments to
            StreamingStep.
        """
        version = self.get_hadoop_version()

        step_args = []
        cache_files = []
        cache_archives = []

        if compat.supports_new_distributed_cache_options(version):
            # boto doesn't support non-deprecated 0.20 options, so insert
            # them ourselves

            def escaped_paths(file_dicts):
                # return list of strings to join with commas and pass to the
                # hadoop binary
                return ["%s#%s" % (fd['s3_uri'], fd['name'])
                        for fd in file_dicts]

            # index by type
            all_files = {}
            for fd in self._files:
                all_files.setdefault(fd.get('upload'), []).append(fd)

            if 'file' in all_files:
                step_args.append('-files')
                step_args.append(','.join(escaped_paths(all_files['file'])))

            if 'archive' in all_files:
                step_args.append('-archives')
                step_args.append(','.join(escaped_paths(all_files['archive'])))
        else:
            for file_dict in self._files:
                if file_dict.get('upload') == 'file':
                    cache_files.append(
                        '%s#%s' % (file_dict['s3_uri'], file_dict['name']))
                elif file_dict.get('upload') == 'archive':
                    cache_archives.append(
                        '%s#%s' % (file_dict['s3_uri'], file_dict['name']))

        return step_args, cache_files, cache_archives

    def _get_jar(self):
        self._name_files()
        self._pick_s3_uris_for_files()

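        # prefer a streaming jar we uploaded ourselves; otherwise fall back
        # to the streaming jar already installed on EMR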
        if self._streaming_jar:
            return self._streaming_jar['s3_uri']
        else:
            return self._opts['hadoop_streaming_jar_on_emr']

    def _launch_emr_job(self):
        """Create an empty jobflow on EMR, and set self._emr_job_flow_id to
        the ID for that job."""
        self._create_s3_temp_bucket_if_needed()
        # define our steps
        steps = self._build_steps()

        # try to find a job flow from the pool. basically auto-fill
        # 'emr_job_flow_id' if possible and then follow normal behavior.
        if self._opts['pool_emr_job_flows']:
            job_flow = self.find_job_flow(num_steps=len(steps))
            if job_flow:
                self._emr_job_flow_id = job_flow.jobflowid

        # create a job flow if we're not already using an existing one
        if not self._emr_job_flow_id:
            self._emr_job_flow_id = self._create_job_flow(
                persistent=False, steps=steps)
        else:
            emr_conn = self.make_emr_conn()
            log.info('Adding our job to job flow %s' % self._emr_job_flow_id)
            log.debug('Calling add_jobflow_steps(%r, %r)' % (
                self._emr_job_flow_id, steps))
            emr_conn.add_jobflow_steps(self._emr_job_flow_id, steps)

        # keep track of when we launched our job
        self._emr_job_start = time.time()

    def _wait_for_job_to_complete(self):
        """Wait for the job to complete, and raise an exception if
        the job failed.

        Also grab log URI from the job status (since we may not know it)
        """
        success = False

        while True:
            # don't antagonize EMR's throttling
            log.debug('Waiting %.1f seconds...' %
                      self._opts['check_emr_status_every'])
            time.sleep(self._opts['check_emr_status_every'])

            job_flow = self._describe_jobflow()

            self._set_s3_job_log_uri(job_flow)

            job_state = job_flow.state
            reason = getattr(job_flow, 'laststatechangereason', '')

            # find all steps belonging to us, and get their state
            step_states = []
            running_step_name = ''
            total_step_time = 0.0
            step_nums = []  # step numbers belonging to us. 1-indexed

            steps = job_flow.steps or []
            for i, step in enumerate(steps):
                # ignore steps belonging to other jobs
                if not step.name.startswith(self._job_name):
                    continue

                step_nums.append(i + 1)

                step_states.append(step.state)
                if step.state == 'RUNNING':
                    running_step_name = step.name

                if (hasattr(step, 'startdatetime') and
                    hasattr(step, 'enddatetime')):
                    start_time = iso8601_to_timestamp(step.startdatetime)
                    end_time = iso8601_to_timestamp(step.enddatetime)
                    total_step_time += end_time - start_time

            if not step_states:
                raise AssertionError("Can't find our steps in the job flow!")

            # if all our steps have completed, we're done!
            if all(state == 'COMPLETED' for state in step_states):
                success = True
                break

            # if any step fails, give up
            if any(state in ('FAILED', 'CANCELLED') for state in step_states):
                break

            # (the other step states are PENDING and RUNNING)

            # keep track of how long we've been waiting
            running_time = time.time() - self._emr_job_start

            # otherwise, we can print a status message
            if running_step_name:
                log.info('Job launched %.1fs ago, status %s: %s (%s)' %
                         (running_time, job_state, reason, running_step_name))
                if self._show_tracker_progress:
                    try:
                        tracker_handle = urllib2.urlopen(self._tracker_url)
                        tracker_page = ''.join(tracker_handle.readlines())
                        tracker_handle.close()
                        # first two formatted percentages, map then reduce
                        map_complete, reduce_complete = [float(complete)
                            for complete in JOB_TRACKER_RE.findall(
                                tracker_page)[:2]]
                        log.info(' map %3d%% reduce %3d%%' % (
                                 map_complete, reduce_complete))
                    except:
                        log.error('Unable to load progress from job tracker')
                        # turn off progress for rest of job
                        self._show_tracker_progress = False
                # once a step is running, it's safe to set up the ssh tunnel to
                # the job tracker
                job_host = getattr(job_flow, 'masterpublicdnsname', None)
                if job_host and self._opts['ssh_tunnel_to_job_tracker']:
                    self.setup_ssh_tunnel_to_job_tracker(job_host)

            # other states include STARTING and SHUTTING_DOWN
            elif reason:
                log.info('Job launched %.1fs ago, status %s: %s' %
                         (running_time, job_state, reason))
            else:
                log.info('Job launched %.1fs ago, status %s' %
                         (running_time, job_state,))

        if success:
            log.info('Job completed.')
            log.info('Running time was %.1fs (not counting time spent waiting'
                     ' for the EC2 instances)' % total_step_time)
            self._fetch_counters(step_nums)
            self.print_counters(range(1, len(step_nums) + 1))
        else:
            msg = 'Job failed with status %s: %s' % (job_state, reason)
            log.error(msg)
            if self._s3_job_log_uri:
                log.info('Logs are in %s' % self._s3_job_log_uri)
            # look for a Python traceback
            cause = self._find_probable_cause_of_failure(step_nums)
            if cause:
                # log cause, and put it in exception
                cause_msg = []  # lines to log and put in exception
                cause_msg.append('Probable cause of failure (from %s):' %
                                 cause['log_file_uri'])
                cause_msg.extend(line.strip('\n') for line in cause['lines'])
                if cause['input_uri']:
                    cause_msg.append('(while reading from %s)' %
                                     cause['input_uri'])

                for line in cause_msg:
                    log.error(line)

                # add cause_msg to exception message
                msg += '\n' + '\n'.join(cause_msg) + '\n'

            raise Exception(msg)

    def _script_args(self):
        """How to invoke the script inside EMR"""
        # We can invoke the script by its S3 URL, but we don't really
        # gain anything from that, and EMR is touchy about distinguishing
        # python scripts from shell scripts

        assert self._script  # shouldn't call _script_args() if no script

        args = self._opts['python_bin'] + [self._script['name']]
        if self._wrapper_script:
            args = (self._opts['python_bin'] +
                    [self._wrapper_script['name']] +
                    args)

        return args

    def _mapper_args(self, step_num):
        return (self._script_args() +
                ['--step-num=%d' % step_num, '--mapper'] +
                self._mr_job_extra_args())

    def _reducer_args(self, step_num):
        return (self._script_args() +
                ['--step-num=%d' % step_num, '--reducer'] +
                self._mr_job_extra_args())

    def _combiner_args(self, step_num):
        return (self._script_args() +
                ['--step-num=%d' % step_num, '--combiner'] +
                self._mr_job_extra_args())

    def _upload_args(self):
        """Args to upload files from S3 to the local nodes that EMR runs
        on."""
        args = []
        for file_dict in self._files:
            if file_dict.get('upload') == 'file':
                args.append('--cache')
                args.append('%s#%s' % (file_dict['s3_uri'], file_dict['name']))
            elif file_dict.get('upload') == 'archive':
                args.append('--cache-archive')
                args.append('%s#%s' % (file_dict['s3_uri'], file_dict['name']))

        return args

    def _s3_step_input_uris(self, step_num):
        """Get the s3:// URIs for input for the given step."""
        if step_num == 0:
            return self._s3_input_uris
        else:
            # put intermediate data in HDFS
            return ['hdfs:///tmp/mrjob/%s/step-output/%s/' % (
                self._job_name, step_num)]

    def _s3_step_output_uri(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            # put intermediate data in HDFS
            return 'hdfs:///tmp/mrjob/%s/step-output/%s/' % (
                self._job_name, step_num + 1)

    ### LOG FETCHING/PARSING ###

    def _enforce_path_regexp(self, paths, regexp, step_nums=None):
        """Helper for log fetching functions to filter out unwanted
        logs. Only pass ``step_nums`` if ``regexp`` has a ``step_nums`` group.
        """
        for path in paths:
            m = regexp.match(path)
            if (m and
                (step_nums is None or
                 int(m.group('step_num')) in step_nums)):
                yield path
            else:
                log.debug('Ignore %s' % path)

    ## SSH LOG FETCHING

    def _ls_ssh_logs(self, relative_path):
        """List logs over SSH by path relative to log root directory"""
        full_path = SSH_PREFIX + SSH_LOG_ROOT + '/' + relative_path
        log.debug('Search %s for logs' % full_path)
        return self.ls(full_path)

    def _ls_slave_ssh_logs(self, addr, relative_path):
        """List logs over multi-hop SSH by path relative to log root directory
        """
        root_path = '%s%s!%s%s' % (SSH_PREFIX,
                                   self._address_of_master(),
                                   addr,
                                   SSH_LOG_ROOT + '/' + relative_path)
        log.debug('Search %s for logs' % root_path)
        return self.ls(root_path)

    def ls_task_attempt_logs_ssh(self, step_nums):
        all_paths = []
        try:
            all_paths.extend(self._ls_ssh_logs('userlogs/'))
        except IOError:
            # sometimes the master doesn't have these
            pass
        if not all_paths:
            # get them from the slaves instead (takes a little longer)
            try:
                for addr in self._addresses_of_slaves():
                    logs = self._ls_slave_ssh_logs(addr, 'userlogs/')
                    all_paths.extend(logs)
            except IOError:
                # sometimes the slaves don't have them either
                pass
        return self._enforce_path_regexp(all_paths,
                                         TASK_ATTEMPTS_LOG_URI_RE,
                                         step_nums)

    def ls_step_logs_ssh(self, step_nums):
        return self._enforce_path_regexp(self._ls_ssh_logs('steps/'),
                                         STEP_LOG_URI_RE,
                                         step_nums)

    def ls_job_logs_ssh(self, step_nums):
        return self._enforce_path_regexp(self._ls_ssh_logs('history/'),
                                         EMR_JOB_LOG_URI_RE,
                                         step_nums)

    def ls_node_logs_ssh(self):
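        # node logs live on the individual slave nodes, so list them over
        # multi-hop SSH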
        all_paths = []
        for addr in self._addresses_of_slaves():
            logs = self._ls_slave_ssh_logs(addr, '')
            all_paths.extend(logs)
        return self._enforce_path_regexp(all_paths, NODE_LOG_URI_RE)

    def ls_all_logs_ssh(self):
        """List all log files in the log root directory"""
        return self.ls(SSH_PREFIX + SSH_LOG_ROOT)

    ## S3 LOG FETCHING ##

    def _ls_s3_logs(self, relative_path):
        """List logs over S3 by path relative to log root directory"""
        if not self._s3_job_log_uri:
            self._set_s3_job_log_uri(self._describe_jobflow())

        if not self._s3_job_log_uri:
            raise LogFetchError('Could not determine S3 job log URI')

        full_path = self._s3_job_log_uri + relative_path
        log.debug('Search %s for logs' % full_path)
        return self.ls(full_path)

    def ls_task_attempt_logs_s3(self, step_nums):
        return self._enforce_path_regexp(self._ls_s3_logs('task-attempts/'),
                                         TASK_ATTEMPTS_LOG_URI_RE,
                                         step_nums)

    def ls_step_logs_s3(self, step_nums):
        return self._enforce_path_regexp(self._ls_s3_logs('steps/'),
                                         STEP_LOG_URI_RE,
                                         step_nums)

    def ls_job_logs_s3(self, step_nums):
        return self._enforce_path_regexp(self._ls_s3_logs('jobs/'),
                                         EMR_JOB_LOG_URI_RE,
                                         step_nums)

    def ls_node_logs_s3(self):
        return self._enforce_path_regexp(self._ls_s3_logs('node/'),
                                         NODE_LOG_URI_RE)

    def ls_all_logs_s3(self):
        """List all log files in the S3 log root directory"""
        if not self._s3_job_log_uri:
            self._set_s3_job_log_uri(self._describe_jobflow())
        return self.ls(self._s3_job_log_uri)

    ## LOG PARSING ##

    def _fetch_counters(self, step_nums, skip_s3_wait=False):
        """Read Hadoop counters from S3.

        Args:
        step_nums -- the steps belonging to us, so that we can ignore counters
                     from other jobs when sharing a job flow
        """
        self._counters = []
        new_counters = {}
        if self._opts['ec2_key_pair_file']:
            try:
                new_counters = self._fetch_counters_ssh(step_nums)
            except LogFetchError:
                new_counters = self._fetch_counters_s3(step_nums, skip_s3_wait)
            except IOError:
                # Can get 'file not found' if test suite was lazy or Hadoop
                # logs moved. We shouldn't crash in either case.
                new_counters = self._fetch_counters_s3(step_nums, skip_s3_wait)
        else:
            log.info('ec2_key_pair_file not specified, going to S3')
            new_counters = self._fetch_counters_s3(step_nums, skip_s3_wait)

        # step_nums is relative to the start of the job flow
        # we only want them relative to the job
        for step_num in step_nums:
            self._counters.append(new_counters.get(step_num, {}))

    def _fetch_counters_ssh(self, step_nums):
        uris = list(self.ls_job_logs_ssh(step_nums))
        log.info('Fetching counters from SSH...')
        return scan_for_counters_in_files(uris, self,
                                          self.get_hadoop_version())

    def _fetch_counters_s3(self, step_nums, skip_s3_wait=False):
        job_flow = self._describe_jobflow()
        if job_flow.keepjobflowalivewhennosteps == 'true':
            log.info("Can't fetch counters from S3 for five more minutes. Try"
                     " 'python -m mrjob.tools.emr.fetch_logs --counters %s'"
                     " in five minutes." % job_flow.jobflowid)
            return {}

        log.info('Fetching counters from S3...')

        if not skip_s3_wait:
            self._wait_for_s3_eventual_consistency()
        self._wait_for_job_flow_termination()

        try:
            uris = self.ls_job_logs_s3(step_nums)
            return scan_for_counters_in_files(uris, self,
                                              self.get_hadoop_version())
        except LogFetchError, e:
            log.info("Unable to fetch counters: %s" % e)
            return {}

    def counters(self):
        return self._counters

    def _find_probable_cause_of_failure(self, step_nums):
        """Scan logs for Python exception tracebacks.

        Args:
        step_nums -- the numbers of steps belonging to us, so that we
            can ignore errors from other jobs when sharing a job flow

        Returns:
        None (nothing found) or a dictionary containing:
        lines -- lines in the log file containing the error message
        log_file_uri -- the log file containing the error message
        input_uri -- if the error happened in a mapper in the first
            step, the URI of the input file that caused the error
            (otherwise None)
        """
        if self._opts['ec2_key_pair_file']:
            try:
                return self._find_probable_cause_of_failure_ssh(step_nums)
            except LogFetchError:
                return self._find_probable_cause_of_failure_s3(step_nums)
        else:
            log.info('ec2_key_pair_file not specified, going to S3')
            return self._find_probable_cause_of_failure_s3(step_nums)

    def _find_probable_cause_of_failure_ssh(self, step_nums):
        task_attempt_logs = self.ls_task_attempt_logs_ssh(step_nums)
        step_logs = self.ls_step_logs_ssh(step_nums)
        job_logs = self.ls_job_logs_ssh(step_nums)
        log.info('Scanning SSH logs for probable cause of failure')
        return scan_logs_in_order(task_attempt_logs=task_attempt_logs,
                                  step_logs=step_logs,
                                  job_logs=job_logs,
                                  runner=self)

    def _find_probable_cause_of_failure_s3(self, step_nums):
        log.info('Scanning S3 logs for probable cause of failure')
        self._wait_for_s3_eventual_consistency()
        self._wait_for_job_flow_termination()

        task_attempt_logs = self.ls_task_attempt_logs_s3(step_nums)
        step_logs = self.ls_step_logs_s3(step_nums)
        job_logs = self.ls_job_logs_s3(step_nums)
        return scan_logs_in_order(task_attempt_logs=task_attempt_logs,
                                  step_logs=step_logs,
                                  job_logs=job_logs,
                                  runner=self)

    ### Bootstrapping ###

    def _create_master_bootstrap_script(self, dest='b.py'):
        """Create the master bootstrap script and write it into our local
        temp directory.

        This will do nothing if there are no bootstrap scripts or commands,
        or if _create_master_bootstrap_script() has already been called."""
        # we call the script b.py because there's a character limit on
        # bootstrap script names (or there was at one time, anyway)

        if not any(key.startswith('bootstrap_') and value
                   for (key, value) in self._opts.iteritems()):
            return

        # don't bother if we're not starting a job flow
        if self._opts['emr_job_flow_id']:
            return

        # Also don't bother if we're not pooling (and therefore don't need
        # to have a bootstrap script to attach to) and we're not bootstrapping
        # anything else
        if not (self._opts['pool_emr_job_flows'] or
            any(key.startswith('bootstrap_') and
                key != 'bootstrap_actions' and  # these are separate scripts
                value
                for (key, value) in self._opts.iteritems())):
            return

        if self._opts['bootstrap_mrjob']:
            if self._mrjob_tar_gz_file is None:
                self._mrjob_tar_gz_file = self._add_bootstrap_file(
                    self._create_mrjob_tar_gz() + '#')

        # need to know what files are called
        self._name_files()
        self._pick_s3_uris_for_files()

        path = os.path.join(self._get_local_tmp_dir(), dest)
        log.info('writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content()
        for line in StringIO(contents):
            log.debug('BOOTSTRAP: ' + line.rstrip('\r\n'))

        f = open(path, 'w')
        f.write(contents)
        f.close()

        name, _ = self._split_path(path)
        self._master_bootstrap_script = {'path': path, 'name': name}
        self._files.append(self._master_bootstrap_script)

    def _master_bootstrap_script_content(self):
        """Create the contents of the master bootstrap script.

        This will give names and S3 URIs to files that don't already have them.

        This function does NOT pick S3 URIs for files or anything like
        that; _create_master_bootstrap_script() is responsible for that.
        """
        out = StringIO()

        python_bin_in_list = ', '.join(repr(opt) for opt in self._opts['python_bin'])

        def writeln(line=''):
            out.write(line + '\n')

        # shebang
        writeln('#!/usr/bin/python')
        writeln()

        # imports
        writeln('from __future__ import with_statement')
        writeln()
        writeln('import distutils.sysconfig')
        writeln('import os')
        writeln('import stat')
        writeln('from subprocess import call, check_call')
        writeln('from tempfile import mkstemp')
        writeln('from xml.etree.ElementTree import ElementTree')
        writeln()

        # download files using hadoop fs
        writeln('# download files using hadoop fs -copyToLocal')
        for file_dict in self._files:
            if file_dict.get('bootstrap'):
                writeln(
                    "check_call(['hadoop', 'fs', '-copyToLocal', %r, %r])" %
                    (file_dict['s3_uri'], file_dict['name']))
        writeln()

        # make scripts executable
        if self._bootstrap_scripts:
            writeln('# make bootstrap scripts executable')
            for file_dict in self._bootstrap_scripts:
                writeln("check_call(['chmod', 'a+rx', %r])" %
                        file_dict['name'])
            writeln()

        # bootstrap mrjob
        if self._opts['bootstrap_mrjob']:
            writeln('# bootstrap mrjob')
            writeln("site_packages = distutils.sysconfig.get_python_lib()")
            writeln(
                "check_call(['sudo', 'tar', 'xfz', %r, '-C', site_packages])" %
                self._mrjob_tar_gz_file['name'])
            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory. Don't fail if there is extra
            # un-compileable crud in the tarball.
            writeln("mrjob_dir = os.path.join(site_packages, 'mrjob')")
            writeln("call(["
                    "'sudo', %s, '-m', 'compileall', '-f', mrjob_dir])" %
                    python_bin_in_list)
            writeln()

        # install our python modules
        if self._bootstrap_python_packages:
            writeln('# install python modules:')
            for file_dict in self._bootstrap_python_packages:
                writeln("check_call(['tar', 'xfz', %r])" %
                        file_dict['name'])
                # figure out name of dir to CD into
                assert file_dict['path'].endswith('.tar.gz')
                cd_into = extract_dir_for_tar(file_dict['path'])
                # install the module
                writeln("check_call(["
                        "'sudo', %s, 'setup.py', 'install'], cwd=%r)" %
                        (python_bin_in_list, cd_into))

        # run our commands
        if self._opts['bootstrap_cmds']:
            writeln('# run bootstrap cmds:')
            for cmd in self._opts['bootstrap_cmds']:
                if isinstance(cmd, basestring):
                    writeln('check_call(%r, shell=True)' % cmd)
                else:
                    writeln('check_call(%r)' % cmd)
            writeln()

        # run our scripts
        if self._bootstrap_scripts:
            writeln('# run bootstrap scripts:')
            for file_dict in self._bootstrap_scripts:
                writeln('check_call(%r)' % (
                    ['./' + file_dict['name']],))
            writeln()

        return out.getvalue()

    ### EMR JOB MANAGEMENT UTILS ###

    def make_persistent_job_flow(self):
        """Create a new EMR job flow that requires manual termination, and
        return its ID.

        You can also fetch the job ID by calling self.get_emr_job_flow_id()
        """
        if (self._emr_job_flow_id):
            raise AssertionError(
                'This runner is already associated with job flow ID %s' %
                (self._emr_job_flow_id))

        log.info('Creating persistent job flow to run several jobs in...')

        self._create_master_bootstrap_script()
        self._upload_non_input_files()

        # don't allow user to call run()
        self._ran_job = True

        self._emr_job_flow_id = self._create_job_flow(persistent=True)

        return self._emr_job_flow_id

    def get_emr_job_flow_id(self):
        return self._emr_job_flow_id

    def usable_job_flows(self, emr_conn=None, exclude=None, num_steps=1):
        """Get job flows that this runner can use.

        We basically expect to only join available job flows with the exact
        same setup as our own, that is:

        - same bootstrap setup (including mrjob version)
        - have the same Hadoop and AMI version
        - same number and type of instances

        However, we allow joining job flows where for each role, every instance
        has at least as much memory as we require, and the total number of
        compute units is at least what we require.

        There also must be room for our job in the job flow (job flows top out
        at 256 steps).

        We then sort by:
        - total compute units for core + task nodes
        - total compute units for master node
        - time left to an even instance hour

        The most desirable job flows come *last* in the list.

        :return: list of :py:class:`boto.emr.emrobject.JobFlow`
        """
        emr_conn = emr_conn or self.make_emr_conn()
        exclude = exclude or set()

        req_hash = self._pool_hash()

        # decide memory and total compute units requested for each
        # role type
        role_to_req_instance_type = {}
        role_to_req_num_instances = {}
        role_to_req_mem = {}
        role_to_req_cu = {}
        role_to_req_bid_price = {}

        for role in ('core', 'master', 'task'):
            instance_type = self._opts['ec2_%s_instance_type' % role]
            if role == 'master':
                num_instances = 1
            else:
                num_instances = self._opts['num_ec2_%s_instances' % role]

            role_to_req_instance_type[role] = instance_type
            role_to_req_num_instances[role] = num_instances

            role_to_req_bid_price[role] = (
                self._opts['ec2_%s_instance_bid_price' % role])

            # unknown instance types can only match themselves
            role_to_req_mem[role] = (
                EC2_INSTANCE_TYPE_TO_MEMORY.get(instance_type, float('Inf')))
            role_to_req_cu[role] = (
                num_instances *
                EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS.get(instance_type,
                                                       float('Inf')))

        sort_keys_and_job_flows = []

        def add_if_match(job_flow):
            # this may be a retry due to locked job flows
            if job_flow.jobflowid in exclude:
                return

            # only take persistent job flows
            if job_flow.keepjobflowalivewhennosteps != 'true':
                return

            # match pool name, and (bootstrap) hash
            hash, name = pool_hash_and_name(job_flow)
            if req_hash != hash:
                return

            if self._opts['emr_job_flow_pool_name'] != name:
                return

            # match hadoop version
            if job_flow.hadoopversion != self.get_hadoop_version():
                return

            # match AMI version
            job_flow_ami_version = getattr(job_flow, 'amiversion', None)
            if job_flow_ami_version != self._opts['ami_version']:
                return

            # there is a hard limit of 256 steps per job flow
            if len(job_flow.steps) + num_steps > MAX_STEPS_PER_JOB_FLOW:
                return

            # in rare cases, job flow can be WAITING *and* have incomplete
            # steps
            if any(getattr(step, 'enddatetime', None) is None
                   for step in job_flow.steps):
                return

            # total compute units per group
            role_to_cu = defaultdict(float)
            # total number of instances of the same type in each group.
            # This allows us to match unknown instance types.
            role_to_matched_instances = defaultdict(int)

            # check memory and compute units, bailing out if we hit
            # an instance with too little memory
            for ig in job_flow.instancegroups:
                role = ig.instancerole.lower()

                # unknown, new kind of role; bail out!
                if role not in ('core', 'master', 'task'):
                    return

                req_instance_type = role_to_req_instance_type[role]
                if ig.instancetype != req_instance_type:
                    # if too little memory, bail out
                    mem = EC2_INSTANCE_TYPE_TO_MEMORY.get(ig.instancetype, 0.0)
                    req_mem = role_to_req_mem.get(role, 0.0)
                    if mem < req_mem:
                        return

                # if bid price is too low, don't count compute units
                req_bid_price = role_to_req_bid_price[role]
                bid_price = getattr(ig, 'bidprice', None)

                # if the instance is on-demand (no bid price) or bid prices
                # are the same, we're okay
                if bid_price and bid_price != req_bid_price:
                    # whoops, we didn't want spot instances at all
                    if not req_bid_price:
                        continue

                    try:
                        if float(req_bid_price) > float(bid_price):
                            continue
                    except ValueError:
                        # we don't know what to do with non-float bid prices,
                        # and we know it's not equal to what we requested
                        continue

                # don't require instances to be running; we'd be worse off if
                # we started our own job flow from scratch. (This can happen if
                # the previous job finished while some task instances were
                # still being provisioned.)
                cu = (int(ig.instancerequestcount) *
                      EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS.get(
                          ig.instancetype, 0.0))
                role_to_cu[role] += cu

                # track number of instances of the same type
                if ig.instancetype == req_instance_type:
                    role_to_matched_instances[role] += (
                        int(ig.instancerequestcount))

            # check if there are enough compute units
            for role, req_cu in role_to_req_cu.iteritems():
                req_num_instances = role_to_req_num_instances[role]
                # if we have at least as many instances of the right type
                # as we requested, don't bother checking compute units
                if req_num_instances > role_to_matched_instances[role]:
                    cu = role_to_cu.get(role, 0.0)
                    if cu < req_cu:
                        return

            # make a sort key
            sort_key = (role_to_cu['core'] + role_to_cu['task'],
                        role_to_cu['master'],
                        est_time_to_hour(job_flow))
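            # sort_keys_and_job_flows is later sorted ascending, so the most
            # desirable flow (per this key) ends up last; find_job_flow()
            # takes it with [-1]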

            sort_keys_and_job_flows.append((sort_key, job_flow))

        for job_flow in emr_conn.describe_jobflows(states=['WAITING']):
            add_if_match(job_flow)

        return [job_flow for (sort_key, job_flow)
                in sorted(sort_keys_and_job_flows)]

    def find_job_flow(self, num_steps=1):
        """Find a job flow that can host this runner. Prefer flows with more
        compute units. Break ties by choosing flow with longest idle time.
        Return ``None`` if no suitable flows exist.
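
        Illustrative sketch (assumes ``runner`` is an already-configured
        :py:class:`EMRJobRunner`)::

            job_flow = runner.find_job_flow(num_steps=2)
            if job_flow is None:
                pass  # no suitable pooled flow; we'd start our own instead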
        """
        chosen_job_flow = None
        exclude = set()
        emr_conn = self.make_emr_conn()
        s3_conn = self.make_s3_conn()
        while chosen_job_flow is None:
            sorted_tagged_job_flows = self.usable_job_flows(
                emr_conn=emr_conn,
                exclude=exclude,
                num_steps=num_steps)
            if sorted_tagged_job_flows:
                job_flow = sorted_tagged_job_flows[-1]
                status = attempt_to_acquire_lock(
                    s3_conn, self._lock_uri(job_flow),
                    self._opts['s3_sync_wait_time'], self._job_name)
                if status:
                    return job_flow
                else:
                    exclude.add(job_flow.jobflowid)
            else:
                return None

    def _lock_uri(self, job_flow):
        return make_lock_uri(self._opts['s3_scratch_uri'],
                             job_flow.jobflowid,
                             len(job_flow.steps) + 1)

    def _pool_hash(self):
        """Generate a hash of the bootstrap configuration so it can be used to
        match jobs and job flows. The first argument passed to the bootstrap
        script will be ``'pool-'`` plus this hash.
        """
        def should_include_file(info):
            # Bootstrap scripts will always have a different checksum
            if 'name' in info and info['name'] in ('b.py', 'wrapper.py'):
                return False

            # Also do not include the script used to spin up the job
            if self._script and info['path'] == self._script['path']:
                return False

            # Only include bootstrap files
            if 'bootstrap' not in info:
                return False

            # mrjob.tar.gz is covered by the bootstrap_mrjob variable.
            # also, it seems to be different every time, causing an
            # undesirable hash mismatch.
            if (self._opts['bootstrap_mrjob']
                and info is self._mrjob_tar_gz_file):
                return False

            # Ignore job-specific files
            if info['path'] in self._input_paths:
                return False

            return True

        # strip the unique S3 URI, if there is one, so hashes stay comparable
        cleaned_bootstrap_actions = [dict(path=fd['path'], args=fd['args'])
                                     for fd in self._bootstrap_actions]

        things_to_hash = [
            [self.md5sum(fd['path'])
             for fd in self._files if should_include_file(fd)],
            self._opts['additional_emr_info'],
            self._opts['bootstrap_mrjob'],
            self._opts['bootstrap_cmds'],
            cleaned_bootstrap_actions,
        ]
        if self._opts['bootstrap_mrjob']:
            things_to_hash.append(mrjob.__version__)
        return hash_object(things_to_hash)

    ### GENERAL FILESYSTEM STUFF ###

    def du(self, path_glob):
        """Get the size of all files matching path_glob."""
        if not is_s3_uri(path_glob):
            return super(EMRJobRunner, self).du(path_glob)

        return sum(self.get_s3_key(uri).size for uri in self.ls(path_glob))

    def ls(self, path_glob):
        """Recursively list files locally or on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't create S3 keys ending in '/').

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3).
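
        Illustrative sketch (``runner`` is an :py:class:`EMRJobRunner`;
        bucket and paths are made up)::

            for uri in runner.ls('s3://walrus/tmp/output/part-*'):
                print uri  # e.g. s3://walrus/tmp/output/part-00000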
        """
        if SSH_URI_RE.match(path_glob):
            for item in self._ssh_ls(path_glob):
                yield item
            return

        if not is_s3_uri(path_glob):
            for path in super(EMRJobRunner, self).ls(path_glob):
                yield path
            return

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # if it's a "file" (doesn't end with /), just check if it exists
        if not glob_match and not path_glob.endswith('/'):
            uri = path_glob
            if self.get_s3_key(uri):
                yield uri
            return

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        for uri in self._s3_ls(base_uri):
            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            yield uri

    def _ssh_ls(self, uri):
        """Helper for ls(); obeys globbing"""
        m = SSH_URI_RE.match(uri)
        try:
            addr = m.group('hostname') or self._address_of_master()
            if '!' in addr:
                self._enable_slave_ssh_access()
            output = ssh_ls(
                self._opts['ssh_bin'],
                addr,
                self._opts['ec2_key_pair_file'],
                m.group('filesystem_path'),
                self._ssh_key_name,
            )
            for line in output:
                # skip directories; we only want to return downloadable files
                if line and not line.endswith('/'):
                    yield SSH_PREFIX + addr + line
        except SSHException, e:
            raise LogFetchError(e)

    def _s3_ls(self, uri):
        """Helper for ls(); doesn't bother with globbing or directories"""
        s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        bucket = s3_conn.get_bucket(bucket_name)
        for key in bucket.list(key_name):
            yield s3_key_to_uri(key)

    def md5sum(self, path, s3_conn=None):
        if is_s3_uri(path):
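            # S3 reports a key's MD5 as its ETag (caveat: this only holds
            # for keys uploaded in a single part)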
            k = self.get_s3_key(path, s3_conn=s3_conn)
            return k.etag.strip('"')
        else:
            return super(EMRJobRunner, self).md5sum(path)

    def _cat_file(self, filename):
        ssh_match = SSH_URI_RE.match(filename)
        if is_s3_uri(filename):
            # stream lines from the s3 key
            s3_key = self.get_s3_key(filename)
            buffer_iterator = read_file(s3_key_to_uri(s3_key), fileobj=s3_key)
            return buffer_iterator_to_line_iterator(buffer_iterator)
        elif ssh_match:
            try:
                addr = ssh_match.group('hostname') or self._address_of_master()
                if '!' in addr:
                    self._enable_slave_ssh_access()
                output = ssh_cat(
                    self._opts['ssh_bin'],
                    addr,
                    self._opts['ec2_key_pair_file'],
                    ssh_match.group('filesystem_path'),
                    self._ssh_key_name,
                )
                return read_file(filename, fileobj=StringIO(output))
            except SSHException, e:
                raise LogFetchError(e)
        else:
            # read from local filesystem
            return super(EMRJobRunner, self)._cat_file(filename)

    def mkdir(self, dest):
        """Make a directory. This does nothing on S3 because there are
        no directories.
        """
        if not is_s3_uri(dest):
            super(EMRJobRunner, self).mkdir(dest)

    def path_exists(self, path_glob):
        """Does the given path exist?

        If path_glob is a directory (ends with a "/"), we check if there
        are any files starting with that path.
        """
        if not is_s3_uri(path_glob):
            return super(EMRJobRunner, self).path_exists(path_glob)

        # just fall back on ls(); it's smart
        return any(self.ls(path_glob))

    def path_join(self, dirname, filename):
        if is_s3_uri(dirname):
            return posixpath.join(dirname, filename)
        else:
            return os.path.join(dirname, filename)

    def rm(self, path_glob):
        """Remove all files matching the given glob."""
        if not is_s3_uri(path_glob):
            return super(EMRJobRunner, self).rm(path_glob)

        s3_conn = self.make_s3_conn()
        for uri in self.ls(path_glob):
            key = self.get_s3_key(uri, s3_conn)
            if key:
                log.debug('deleting ' + uri)
                key.delete()

            # special case: when deleting a directory, also clean up
            # the _$folder$ files that EMR creates.
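            # e.g. removing s3://walrus/tmp/output/ would also delete the
            # key s3://walrus/tmp/output_$folder$ (bucket/path illustrative)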
            if uri.endswith('/'):
                folder_uri = uri[:-1] + '_$folder$'
                folder_key = self.get_s3_key(folder_uri, s3_conn)
                if folder_key:
                    log.debug('deleting ' + folder_uri)
                    folder_key.delete()

    def touchz(self, dest):
        """Make an empty file in the given location. Raises an error if
        a non-empty file already exists in that location."""
        if not is_s3_uri(dest):
            return super(EMRJobRunner, self).touchz(dest)

        key = self.get_s3_key(dest)
        if key and key.size != 0:
            raise OSError('Non-empty file %r already exists!' % (dest,))

        self.make_s3_key(dest).set_contents_from_string('')

    ### EMR-specific STUFF ###

    def _wrap_aws_conn(self, raw_conn):
        """Wrap a given boto Connection object so that it can retry when
        throttled."""
        def retry_if(ex):
            """Retry if we get a server error indicating throttling. Also
            handle spurious 505s that are thought to be part of a load
            balancer issue inside AWS."""
            return ((isinstance(ex, boto.exception.BotoServerError) and
                     ('Throttling' in ex.body or
                      'RequestExpired' in ex.body or
                      ex.status == 505)) or
                    (isinstance(ex, socket.error) and
                     ex.args in ((104, 'Connection reset by peer'),
                                 (110, 'Connection timed out'))))

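        # roughly: the wrapper retries calls whose exception passes
        # retry_if(), sleeping EMR_BACKOFF seconds at first and multiplying
        # the delay by EMR_BACKOFF_MULTIPLIER each retry, for at most
        # EMR_MAX_TRIES attempts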
        return RetryWrapper(raw_conn,
                            retry_if=retry_if,
                            backoff=EMR_BACKOFF,
                            multiplier=EMR_BACKOFF_MULTIPLIER,
                            max_tries=EMR_MAX_TRIES)

    def make_emr_conn(self):
        """Create a connection to EMR.

        :return: a :py:class:`mrjob.boto_2_1_1_83aae37b.EmrConnection`, a
                 subclass of :py:class:`boto.emr.connection.EmrConnection`,
                 wrapped in a :py:class:`mrjob.retry.RetryWrapper`
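
        Illustrative sketch (``runner`` is an :py:class:`EMRJobRunner`)::

            emr_conn = runner.make_emr_conn()
            for job_flow in emr_conn.describe_jobflows(states=['WAITING']):
                print job_flow.jobflowid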
        """
        # ...which is then wrapped in bacon! Mmmmm!

        # give a non-cryptic error message if boto isn't installed
        if boto is None:
            raise ImportError('You must install boto to connect to EMR')

        region = self._get_region_info_for_emr_conn()
        log.debug('creating EMR connection (to %s)' % region.endpoint)

        raw_emr_conn = boto_2_1_1_83aae37b.EmrConnection(
            aws_access_key_id=self._opts['aws_access_key_id'],
            aws_secret_access_key=self._opts['aws_secret_access_key'],
            region=region)
        return self._wrap_aws_conn(raw_emr_conn)

    def _get_region_info_for_emr_conn(self):
        """Get a :py:class:`boto.ec2.regioninfo.RegionInfo` object to
        initialize EMR connections with.

        This is kind of silly because all
        :py:class:`boto.emr.connection.EmrConnection` ever does with
        this object is extract the hostname, but that's how boto rolls.
        """
        if self._opts['emr_endpoint']:
            endpoint = self._opts['emr_endpoint']
        else:
            # look up endpoint in our table
            try:
                endpoint = REGION_TO_EMR_ENDPOINT[self._aws_region]
            except KeyError:
                raise Exception(
                    "Don't know the EMR endpoint for %s;"
                    " try setting emr_endpoint explicitly" % self._aws_region)

        return boto.ec2.regioninfo.RegionInfo(None, self._aws_region, endpoint)

    def _describe_jobflow(self, emr_conn=None):
        emr_conn = emr_conn or self.make_emr_conn()
        return emr_conn.describe_jobflow(self._emr_job_flow_id)

    def get_hadoop_version(self):
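        """Infer the Hadoop version: from the job flow we're joining if
        there is one, otherwise from the ``hadoop_version`` opt, otherwise
        from ``ami_version`` (falling back to the 'latest' mapping). The
        result is cached.
        """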
        if not self._inferred_hadoop_version:
            if self._emr_job_flow_id:
                # if joining a job flow, infer the version
                self._inferred_hadoop_version = (
                    self._describe_jobflow().hadoopversion)
            else:
                # otherwise, read it from hadoop_version/ami_version
                hadoop_version = self._opts['hadoop_version']
                if hadoop_version:
                    self._inferred_hadoop_version = hadoop_version
                else:
                    ami_version = self._opts['ami_version']
                    # don't explode if we see an AMI version that's
                    # newer than what we know about.
                    self._inferred_hadoop_version = (
                        AMI_VERSION_TO_HADOOP_VERSION.get(ami_version) or
                        AMI_VERSION_TO_HADOOP_VERSION['latest'])

        return self._inferred_hadoop_version

    def _address_of_master(self, emr_conn=None):
        """Get the address of the master node so we can SSH to it"""
        # cache address of master to avoid redundant calls to describe_jobflow
        # also convenient for testing (pretend we can SSH when we really can't
        # by setting this to something not False)
        if self._address:
            return self._address

        try:
            jobflow = self._describe_jobflow(emr_conn)
            if jobflow.state not in ('WAITING', 'RUNNING'):
                raise LogFetchError(
                    'Cannot ssh to master; job flow is not waiting or running')
        except boto.exception.S3ResponseError:
            # This error is raised by mockboto when the jobflow doesn't exist
            raise LogFetchError('Could not get job flow information')

        self._address = jobflow.masterpublicdnsname
        return self._address

    def _addresses_of_slaves(self):
        if not self._ssh_slave_addrs:
            self._ssh_slave_addrs = ssh_slave_addresses(
                self._opts['ssh_bin'],
                self._address_of_master(),
                self._opts['ec2_key_pair_file'])
        return self._ssh_slave_addrs

    ### S3-specific FILESYSTEM STUFF ###

    # Utilities for interacting with S3 using S3 URIs.

    # Try to use the more general filesystem interface unless you really
    # need to do something S3-specific (e.g. setting file permissions)

    def make_s3_conn(self):
        """Create a connection to S3.

        :return: a :py:class:`boto.s3.connection.S3Connection`, wrapped in a
                 :py:class:`mrjob.retry.RetryWrapper`
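
        Illustrative sketch (``runner`` is an :py:class:`EMRJobRunner`;
        the URI is made up)::

            s3_conn = runner.make_s3_conn()
            key = runner.get_s3_key('s3://walrus/tmp/foo', s3_conn=s3_conn)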
        """
        # give a non-cryptic error message if boto isn't installed
        if boto is None:
            raise ImportError('You must install boto to connect to S3')

        s3_endpoint = self._get_s3_endpoint()
        log.debug('creating S3 connection (to %s)' % s3_endpoint)

        raw_s3_conn = boto.connect_s3(
            aws_access_key_id=self._opts['aws_access_key_id'],
            aws_secret_access_key=self._opts['aws_secret_access_key'],
            host=s3_endpoint)
        return self._wrap_aws_conn(raw_s3_conn)

    def _get_s3_endpoint(self):
        if self._opts['s3_endpoint']:
            return self._opts['s3_endpoint']
        else:
            # look it up in our table
            try:
                return REGION_TO_S3_ENDPOINT[self._aws_region]
            except KeyError:
                raise Exception(
                    "Don't know the S3 endpoint for %s;"
                    " try setting s3_endpoint explicitly" % self._aws_region)

    def get_s3_key(self, uri, s3_conn=None):
        """Get the boto Key object matching the given S3 uri, or
        return None if that key doesn't exist.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        return s3_conn.get_bucket(bucket_name).get_key(key_name)

    def make_s3_key(self, uri, s3_conn=None):
        """Create the given S3 key, and return the corresponding
        boto Key object.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        return s3_conn.get_bucket(bucket_name).new_key(key_name)

    def get_s3_keys(self, uri, s3_conn=None):
        """Get a stream of boto Key objects for each key inside
        the given dir on S3.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()

        bucket_name, key_prefix = parse_s3_uri(uri)
        bucket = s3_conn.get_bucket(bucket_name)
        for key in bucket.list(key_prefix):
            yield key

    def get_s3_folder_keys(self, uri, s3_conn=None):
        """Background: S3 is even less of a filesystem than HDFS in that it
        doesn't have directories. EMR fakes directories by creating special
        ``*_$folder$`` keys in S3.

        For example if your job outputs ``s3://walrus/tmp/output/part-00000``,
        EMR will also create these keys:

        - ``s3://walrus/tmp_$folder$``
        - ``s3://walrus/tmp/output_$folder$``

        If you want to grant another Amazon user access to your files so they
        can use them in S3, you must grant read access on the actual keys,
        plus any ``*_$folder$`` keys that "contain" your keys; otherwise
        EMR will error out with a permissions error.

        This gets all the ``*_$folder$`` keys associated with the given URI,
        as boto Key objects.

        This does not support globbing.

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
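
        Illustrative sketch (``runner`` is an :py:class:`EMRJobRunner`;
        bucket and path are made up)::

            uri = 's3://walrus/tmp/output/part-00000'
            for key in runner.get_s3_folder_keys(uri):
                print key.name  # e.g. tmp_$folder$, tmp/output_$folder$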
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()

        bucket_name, key_name = parse_s3_uri(uri)
        bucket = s3_conn.get_bucket(bucket_name)

        dirs = key_name.split('/')
        for i in range(len(dirs)):
            folder_name = '/'.join(dirs[:i]) + '_$folder$'
            key = bucket.get_key(folder_name)
            if key:
                yield key