/usr/share/pyshared/mrjob/emr.py is in python-mrjob 0.3.3.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# Copyright 2009-2012 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import with_statement
from collections import defaultdict
from datetime import datetime
from datetime import timedelta
import fnmatch
import logging
import os
import posixpath
import random
import re
import shlex
import signal
import socket
from subprocess import Popen
from subprocess import PIPE
import time
import urllib2
try:
from cStringIO import StringIO
StringIO # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
from StringIO import StringIO
try:
import simplejson as json # preferred because of C speedups
json # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
import json # built in to Python 2.6 and later
try:
import boto
import boto.ec2
import boto.emr
import boto.exception
import boto.utils
from mrjob import boto_2_1_1_83aae37b
boto # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
# don't require boto; MRJobs don't actually need it when running
# inside hadoop streaming
boto = None
import mrjob
from mrjob import compat
from mrjob.conf import combine_cmds
from mrjob.conf import combine_dicts
from mrjob.conf import combine_lists
from mrjob.conf import combine_paths
from mrjob.conf import combine_path_lists
from mrjob.logparsers import TASK_ATTEMPTS_LOG_URI_RE
from mrjob.logparsers import STEP_LOG_URI_RE
from mrjob.logparsers import EMR_JOB_LOG_URI_RE
from mrjob.logparsers import NODE_LOG_URI_RE
from mrjob.logparsers import scan_for_counters_in_files
from mrjob.logparsers import scan_logs_in_order
from mrjob.parse import is_s3_uri
from mrjob.parse import parse_s3_uri
from mrjob.pool import est_time_to_hour
from mrjob.pool import pool_hash_and_name
from mrjob.retry import RetryWrapper
from mrjob.runner import MRJobRunner
from mrjob.runner import GLOB_RE
from mrjob.ssh import ssh_cat
from mrjob.ssh import ssh_ls
from mrjob.ssh import ssh_copy_key
from mrjob.ssh import ssh_slave_addresses
from mrjob.ssh import SSHException
from mrjob.ssh import SSH_PREFIX
from mrjob.ssh import SSH_LOG_ROOT
from mrjob.ssh import SSH_URI_RE
from mrjob.util import buffer_iterator_to_line_iterator
from mrjob.util import cmd_line
from mrjob.util import extract_dir_for_tar
from mrjob.util import hash_object
from mrjob.util import read_file
log = logging.getLogger('mrjob.emr')
JOB_TRACKER_RE = re.compile('(\d{1,3}\.\d{2})%')
# if EMR throttles us, how long to wait (in seconds) before trying again?
EMR_BACKOFF = 20
EMR_BACKOFF_MULTIPLIER = 1.5
EMR_MAX_TRIES = 20 # this takes about a day before we run out of tries
# the port to tunnel to
EMR_JOB_TRACKER_PORT = 9100
EMR_JOB_TRACKER_PATH = '/jobtracker.jsp'
MAX_SSH_RETRIES = 20
# ssh should fail right away if it can't bind a port
WAIT_FOR_SSH_TO_FAIL = 1.0
# sometimes AWS gives us seconds as a decimal, which we can't parse
# with boto.utils.ISO8601
SUBSECOND_RE = re.compile('\.[0-9]+')
# map from AWS region to EMR endpoint. See
# http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?ConceptsRequestEndpoints.html
REGION_TO_EMR_ENDPOINT = {
'EU': 'eu-west-1.elasticmapreduce.amazonaws.com',
'us-east-1': 'us-east-1.elasticmapreduce.amazonaws.com',
'us-west-1': 'us-west-1.elasticmapreduce.amazonaws.com',
'': 'elasticmapreduce.amazonaws.com', # when no region specified
}
# map from AWS region to S3 endpoint. See
# http://docs.amazonwebservices.com/AmazonS3/latest/dev/MakingRequests.html#RequestEndpoints
REGION_TO_S3_ENDPOINT = {
'EU': 's3-eu-west-1.amazonaws.com',
'us-east-1': 's3.amazonaws.com', # no region-specific endpoint
'us-west-1': 's3-us-west-1.amazonaws.com',
'ap-southeast-1': 's3-ap-southeast-1.amazonaws.com', # no EMR endpoint yet
'': 's3.amazonaws.com',
}
# map from AWS region to S3 LocationConstraint parameter for regions whose
# location constraints differ from their AWS regions. See
# http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?RESTBucketPUT.html
REGION_TO_S3_LOCATION_CONSTRAINT = {
'us-east-1': '',
}
# map from instance type to number of compute units
# from http://aws.amazon.com/ec2/instance-types/
EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS = {
't1.micro': 2,
'm1.small': 1,
'm1.large': 4,
'm1.xlarge': 8,
'm2.xlarge': 6.5,
'm2.2xlarge': 13,
'm2.4xlarge': 26,
'c1.medium': 5,
'c1.xlarge': 20,
'cc1.4xlarge': 33.5,
'cg1.4xlarge': 33.5,
}
# map from instance type to GB of memory
# from http://aws.amazon.com/ec2/instance-types/
EC2_INSTANCE_TYPE_TO_MEMORY = {
't1.micro': 0.6,
'm1.small': 1.7,
'm1.large': 7.5,
'm1.xlarge': 15,
'm2.xlarge': 17.5,
'm2.2xlarge': 34.2,
'm2.4xlarge': 68.4,
'c1.medium': 1.7,
'c1.xlarge': 7,
'cc1.4xlarge': 23,
'cg1.4xlarge': 22,
}
# Use this to figure out which hadoop version we're using if it's not
# explicitly specified, so we can keep from passing deprecated command-line
# options to Hadoop. If we encounter an AMI version we don't recognize,
# we use whatever version matches 'latest'.
#
# The reason we don't just create a job flow and then query its Hadoop version
# is that for most jobs, we create the steps and the job flow at the same time.
AMI_VERSION_TO_HADOOP_VERSION = {
None: '0.18', # ami_version not specified means version 1.0
'1.0': '0.18',
'2.0': '0.20.205',
'latest': '0.20.205',
}
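# Illustrative sketch (hypothetical helper, not part of the original module):
# the lookup described in the comment above -- map a requested AMI version to
# its Hadoop version, falling back to the 'latest' entry for AMI versions we
# don't recognize.
def _example_hadoop_version_for_ami(ami_version):
    if ami_version in AMI_VERSION_TO_HADOOP_VERSION:
        return AMI_VERSION_TO_HADOOP_VERSION[ami_version]
    return AMI_VERSION_TO_HADOOP_VERSION['latest']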
# EMR's hard limit on number of steps in a job flow
MAX_STEPS_PER_JOB_FLOW = 256
def s3_key_to_uri(s3_key):
"""Convert a boto Key object into an ``s3://`` URI"""
return 's3://%s/%s' % (s3_key.bucket.name, s3_key.name)
# AWS actually gives dates in two formats, and we only recently started using
# API calls that return the second. So the date parsing function is called
# iso8601_to_*, but it also parses RFC1123.
# Until boto starts seamlessly parsing these, we check for them ourselves.
# Thu, 29 Mar 2012 04:55:44 GMT
RFC1123 = '%a, %d %b %Y %H:%M:%S %Z'
def iso8601_to_timestamp(iso8601_time):
iso8601_time = SUBSECOND_RE.sub('', iso8601_time)
try:
return time.mktime(time.strptime(iso8601_time, boto.utils.ISO8601))
except ValueError:
return time.mktime(time.strptime(iso8601_time, RFC1123))
def iso8601_to_datetime(iso8601_time):
iso8601_time = SUBSECOND_RE.sub('', iso8601_time)
try:
return datetime.strptime(iso8601_time, boto.utils.ISO8601)
except ValueError:
return datetime.strptime(iso8601_time, RFC1123)
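# Illustrative sketch (hypothetical, not part of the original module): the
# parsers above accept either an ISO8601 timestamp (the format boto expects)
# or an RFC1123 timestamp like the example in the comment above. Wrapped in a
# function so nothing runs (or requires boto) at import time.
def _example_parse_both_date_formats():
    # both of these should come out as roughly datetime(2012, 3, 29, 4, 55, 44)
    dt_iso = iso8601_to_datetime('2012-03-29T04:55:44Z')
    dt_rfc = iso8601_to_datetime('Thu, 29 Mar 2012 04:55:44 GMT')
    return dt_iso, dt_rfc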
def describe_all_job_flows(emr_conn, states=None, jobflow_ids=None,
created_after=None, created_before=None):
"""Iteratively call ``EmrConnection.describe_job_flows()`` until we really
get all the available job flow information. Currently, 2 months of data
is available through the EMR API.
This is a way of getting around the limits of the API, both on number
of job flows returned, and how far back in time we can go.
:type states: list
:param states: A list of strings with job flow states wanted
:type jobflow_ids: list
:param jobflow_ids: A list of job flow IDs
:type created_after: datetime
:param created_after: Bound on job flow creation time
:type created_before: datetime
:param created_before: Bound on job flow creation time
"""
all_job_flows = []
ids_seen = set()
# weird things can happen if we send no args to the DescribeJobFlows API
# (see Issue #346), so if nothing else is set, set created_before
# to a day in the future.
if not (states or jobflow_ids or created_after or created_before):
created_before = datetime.utcnow() + timedelta(days=1)
while True:
if created_before and created_after and created_before < created_after:
break
log.debug('Calling describe_jobflows(states=%r, jobflow_ids=%r,'
' created_after=%r, created_before=%r)' %
(states, jobflow_ids, created_after, created_before))
try:
results = emr_conn.describe_jobflows(
states=states, jobflow_ids=jobflow_ids,
created_after=created_after, created_before=created_before)
except boto.exception.BotoServerError, ex:
if 'ValidationError' in ex.body:
log.debug(
' reached earliest allowed created_before time, done!')
break
else:
raise
# don't count the same job flow twice
job_flows = [jf for jf in results if jf.jobflowid not in ids_seen]
log.debug(' got %d results (%d new)' % (len(results), len(job_flows)))
all_job_flows.extend(job_flows)
ids_seen.update(jf.jobflowid for jf in job_flows)
if job_flows:
# set created_before to be just after the start time of
# the first job returned, to deal with job flows started
# in the same second
min_create_time = min(iso8601_to_datetime(jf.creationdatetime)
for jf in job_flows)
created_before = min_create_time + timedelta(seconds=1)
# if someone managed to start 501 job flows in the same second,
# they are still screwed (the EMR API only returns up to 500),
# but this seems unlikely. :)
else:
if not created_before:
created_before = datetime.utcnow()
created_before -= timedelta(weeks=2)
return all_job_flows
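# Illustrative usage sketch (hypothetical, not part of the original module):
# list job flows created in the last day, using a connection from
# EMRJobRunner (defined below). Assumes working AWS credentials.
def _example_recent_job_flows():
    emr_conn = EMRJobRunner().make_emr_conn()
    yesterday = datetime.utcnow() - timedelta(days=1)
    return describe_all_job_flows(emr_conn, created_after=yesterday)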
def make_lock_uri(s3_tmp_uri, emr_job_flow_id, step_num):
"""Generate the URI to lock the job flow ``emr_job_flow_id``"""
return s3_tmp_uri + 'locks/' + emr_job_flow_id + '/' + str(step_num)
def _lock_acquire_step_1(s3_conn, lock_uri, job_name, mins_to_expiration=None):
bucket_name, key_prefix = parse_s3_uri(lock_uri)
bucket = s3_conn.get_bucket(bucket_name)
key = bucket.get_key(key_prefix)
# EMRJobRunner should start using a job flow within about a second of
# locking it, so if it's been a while, then it probably crashed and we
# can just use this job flow.
key_expired = False
if key and mins_to_expiration is not None:
last_modified = iso8601_to_datetime(key.last_modified)
age = datetime.utcnow() - last_modified
if age > timedelta(minutes=mins_to_expiration):
key_expired = True
if key is None or key_expired:
key = bucket.new_key(key_prefix)
key.set_contents_from_string(job_name)
return key
else:
return None
def _lock_acquire_step_2(key, job_name):
key_value = key.get_contents_as_string()
return (key_value == job_name)
def attempt_to_acquire_lock(s3_conn, lock_uri, sync_wait_time, job_name,
mins_to_expiration=None):
"""Returns True if this session successfully took ownership of the lock
specified by ``lock_uri``.
"""
key = _lock_acquire_step_1(s3_conn, lock_uri, job_name, mins_to_expiration)
if key is not None:
time.sleep(sync_wait_time)
success = _lock_acquire_step_2(key, job_name)
if success:
return True
return False
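# Illustrative sketch of the locking protocol above (hypothetical, not part
# of the original module): build the lock URI for step 0 of a job flow, then
# try to take ownership of it before adding our steps. Assumes working AWS
# credentials and an existing scratch bucket.
def _example_lock_job_flow(s3_tmp_uri, job_flow_id, job_name, wait_time):
    s3_conn = EMRJobRunner().make_s3_conn()
    lock_uri = make_lock_uri(s3_tmp_uri, job_flow_id, step_num=0)
    return attempt_to_acquire_lock(s3_conn, lock_uri, wait_time, job_name)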
class LogFetchError(Exception):
pass
class EMRJobRunner(MRJobRunner):
"""Runs an :py:class:`~mrjob.job.MRJob` on Amazon Elastic MapReduce.
:py:class:`EMRJobRunner` runs your job in an EMR job flow, which is
basically a temporary Hadoop cluster. Normally, it creates a job flow
just for your job; it's also possible to run your job in a specific
job flow by setting *emr_job_flow_id* or to automatically choose a
waiting job flow, creating one if none exists, by setting
*pool_emr_job_flows*.
Input, support, and jar files can be either local or on S3; use
``s3://...`` URLs to refer to files on S3.
This class has some useful utilities for talking directly to S3 and EMR,
so you may find it useful to instantiate it without a script::
from mrjob.emr import EMRJobRunner
emr_conn = EMRJobRunner().make_emr_conn()
job_flows = emr_conn.describe_jobflows()
...
See also: :py:meth:`~EMRJobRunner.__init__`.
"""
alias = 'emr'
def __init__(self, **kwargs):
""":py:class:`~mrjob.emr.EMRJobRunner` takes the same arguments as
:py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
*aws_access_key_id* and *aws_secret_access_key* are required if you
haven't set them up already for boto (e.g. by setting the environment
variables :envvar:`AWS_ACCESS_KEY_ID` and
:envvar:`AWS_SECRET_ACCESS_KEY`)
Additional options:
:type additional_emr_info: JSON str, None, or JSON-encodable object
:param additional_emr_info: Special parameters to select additional
features, mostly to support beta EMR
features. Pass a JSON string on the command
line or use data structures in the config
file (which is itself basically JSON).
:type ami_version: str
:param ami_version: EMR AMI version to use. This controls which Hadoop
version(s) are available and which version of
Python is installed, among other things; see \
http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?EnvironmentConfig_AMIVersion.html
for details. Implicitly defaults to AMI version
1.0 (this will change to 2.0 in mrjob v0.4).
:type aws_access_key_id: str
:param aws_access_key_id: "username" for Amazon web services.
:type aws_availability_zone: str
:param aws_availability_zone: availability zone to run the job in
:type aws_secret_access_key: str
:param aws_secret_access_key: your "password" on AWS
:type aws_region: str
:param aws_region: region to connect to S3 and EMR on (e.g.
``us-west-1``). If you want to use separate regions
for S3 and EMR, set *emr_endpoint* and
*s3_endpoint*.
:type bootstrap_actions: list of str
:param bootstrap_actions: a list of raw bootstrap actions (essentially
scripts) to run prior to any of the other
bootstrap steps. Any arguments should be
separated from the command by spaces (we use
:py:func:`shlex.split`). If the action is on
the local filesystem, we'll automatically
upload it to S3.
:type bootstrap_cmds: list
:param bootstrap_cmds: a list of commands to run on the master node to
set up libraries, etc. Like *setup_cmds*, these
can be strings, which will be run in the shell,
or lists of args, which will be run directly.
Prepend ``sudo`` to commands to do things that
require root privileges.
:type bootstrap_files: list of str
:param bootstrap_files: files to download to the bootstrap working
directory on the master node before running
*bootstrap_cmds* (for example, Debian
packages). May be local files for mrjob to
upload to S3, or any URI that ``hadoop fs``
can handle.
:type bootstrap_mrjob: boolean
:param bootstrap_mrjob: This is actually an option in the base
:py:class:`~mrjob.job.MRJobRunner` class. If
this is ``True`` (the default), we'll tar up
:py:mod:`mrjob` from the local filesystem, and
install it on the master node.
:type bootstrap_python_packages: list of str
:param bootstrap_python_packages: paths of python modules to install
on EMR. These should be standard
Python module tarballs. If a module
is named ``foo.tar.gz``, we expect to
be able to run ``tar xfz foo.tar.gz;
cd foo;
sudo python setup.py install``.
:type bootstrap_scripts: list of str
:param bootstrap_scripts: scripts to upload and then run on the master
node (a combination of *bootstrap_cmds* and
*bootstrap_files*). These are run after the
commands from *bootstrap_cmds*.
:type check_emr_status_every: float
:param check_emr_status_every: How often to check on the status of EMR
jobs. Default is 30 seconds (too often
and AWS will throttle you anyway).
:type ec2_instance_type: str
:param ec2_instance_type: What sort of EC2 instance(s) to use on the
nodes that actually run tasks (see
http://aws.amazon.com/ec2/instance-types/).
When you run multiple instances (see
*num_ec2_instances*), the master node is just
coordinating the other nodes, so usually the
default instance type (``m1.small``) is fine,
and using larger instances is wasteful.
:type ec2_key_pair: str
:param ec2_key_pair: name of the SSH key you set up for EMR.
:type ec2_key_pair_file: str
:param ec2_key_pair_file: path to file containing the SSH key for EMR
:type ec2_core_instance_type: str
:param ec2_core_instance_type: like *ec2_instance_type*, but only
for the core (also known as "slave")
Hadoop nodes; these nodes run tasks and
host HDFS. Usually you just want to use
*ec2_instance_type*. Defaults to
``'m1.small'``.
:type ec2_core_instance_bid_price: str
:param ec2_core_instance_bid_price: when specified and not "0", this
creates the core Hadoop nodes as spot instances at this bid
price. You usually only want to set bid price for task
instances.
:type ec2_master_instance_type: str
:param ec2_master_instance_type: like *ec2_instance_type*, but only
for the master Hadoop node. This node
hosts the job tracker and HDFS, and
runs tasks if there are no other
nodes. Usually you just want to use
*ec2_instance_type*. Defaults to
``'m1.small'``.
:type ec2_master_instance_bid_price: str
:param ec2_master_instance_bid_price: when specified and not "0", this
creates the master Hadoop node as
a spot instance at this bid
price. You usually only want to
set bid price for task instances
unless the master instance is
your only instance.
:type ec2_slave_instance_type: str
:param ec2_slave_instance_type: An alias for *ec2_core_instance_type*,
for consistency with the EMR API.
:type ec2_task_instance_type: str
:param ec2_task_instance_type: like *ec2_instance_type*, but only
for the task Hadoop nodes; these nodes
run tasks but do not host HDFS. Usually
you just want to use
*ec2_instance_type*. Defaults to
the same instance type as
*ec2_core_instance_type*.
:type ec2_task_instance_bid_price: str
:param ec2_task_instance_bid_price: when specified and not "0", this
creates the task Hadoop nodes as spot instances at this bid
price. (You usually only want to set bid price for task
instances.)
:type emr_endpoint: str
:param emr_endpoint: optional host to connect to when communicating
with EMR (e.g.
``us-west-1.elasticmapreduce.amazonaws.com``).
Default is to infer this from *aws_region*.
:type emr_job_flow_id: str
:param emr_job_flow_id: the ID of a persistent EMR job flow to run jobs
in (normally we launch our own job flow). It's
fine for other jobs to be using the job flow;
we give our job's steps a unique ID.
:type emr_job_flow_pool_name: str
:param emr_job_flow_pool_name: Specify a pool name to join. Is set to
``'default'`` if not specified. Does not
imply ``pool_emr_job_flows``.
:type enable_emr_debugging: str
:param enable_emr_debugging: store Hadoop logs in SimpleDB
:type hadoop_streaming_jar: str
:param hadoop_streaming_jar: This is actually an option in the base
:py:class:`~mrjob.runner.MRJobRunner`
class. Points to a custom hadoop streaming
jar on the local filesystem or S3. If you
want to point to a streaming jar already
installed on the EMR instances (perhaps
through a bootstrap action?), use
*hadoop_streaming_jar_on_emr*.
:type hadoop_streaming_jar_on_emr: str
:param hadoop_streaming_jar_on_emr: Like *hadoop_streaming_jar*, except
that it points to a path on the EMR
instance, rather than to a local
file or one on S3. Rarely necessary
to set this by hand.
:type hadoop_version: str
:param hadoop_version: Set the version of Hadoop to use on EMR.
Consider setting *ami_version* instead; only AMI
version 1.0 supports multiple versions of Hadoop
anyway. If *ami_version* is not set, we'll
default to Hadoop 0.20 for backwards
compatibility with :py:mod:`mrjob` v0.3.0.
:type num_ec2_core_instances: int
:param num_ec2_core_instances: Number of core (or "slave") instances to
start up. These run your job and host
HDFS. Incompatible with
*num_ec2_instances*. This is in addition
to the single master instance.
:type num_ec2_instances: int
:param num_ec2_instances: Total number of instances to start up;
basically the number of core instances you
want, plus 1 (there is always one master
instance). Default is ``1``. Incompatible
with *num_ec2_core_instances* and
*num_ec2_task_instances*.
:type num_ec2_task_instances: int
:param num_ec2_task_instances: number of task instances to start up.
These run your job but do not host
HDFS. Incompatible with
*num_ec2_instances*. If you use this,
you must set *num_ec2_core_instances*;
EMR does not allow you to run task
instances without core instances (because
there's nowhere to host HDFS).
:type pool_emr_job_flows: bool
:param pool_emr_job_flows: Try to run the job on a ``WAITING`` pooled
job flow with the same bootstrap
configuration. Prefer the one with the most
compute units. Use S3 to "lock" the job flow
and ensure that the job is not scheduled
behind another job. If no suitable job flow
is `WAITING`, create a new pooled job flow.
**WARNING**: do not run this without having\
:py:mod:`mrjob.tools.emr.terminate_idle_job_flows`
in your crontab; job flows left idle can
quickly become expensive!
:type s3_endpoint: str
:param s3_endpoint: Host to connect to when communicating with S3 (e.g.
``s3-us-west-1.amazonaws.com``). Default is to
infer this from *aws_region*.
:type s3_log_uri: str
:param s3_log_uri: where on S3 to put logs, for example
``s3://yourbucket/logs/``. Logs for your job flow
will go into a subdirectory, e.g.
``s3://yourbucket/logs/j-YOURJOBID/``. Default
is to append ``logs/`` to *s3_scratch_uri*.
:type s3_scratch_uri: str
:param s3_scratch_uri: S3 directory (URI ending in ``/``) to use as
scratch space, e.g. ``s3://yourbucket/tmp/``.
Default is ``tmp/mrjob/`` in the first bucket
belonging to you.
:type s3_sync_wait_time: float
:param s3_sync_wait_time: How long to wait for S3 to reach eventual
consistency. This is typically less than a
second (zero in U.S. West) but the default is
5.0 to be safe.
:type ssh_bin: str or list
:param ssh_bin: path to the ssh binary; may include switches (e.g.
``'ssh -v'`` or ``['ssh', '-v']``). Defaults to
:command:`ssh`
:type ssh_bind_ports: list of int
:param ssh_bind_ports: a list of ports that are safe to listen on.
Defaults to ports ``40001`` thru ``40840``.
:type ssh_tunnel_to_job_tracker: bool
:param ssh_tunnel_to_job_tracker: If True, create an ssh tunnel to the
job tracker and listen on a randomly
chosen port. This requires you to set
*ec2_key_pair* and
*ec2_key_pair_file*. See
:ref:`ssh-tunneling` for detailed
instructions.
:type ssh_tunnel_is_open: bool
:param ssh_tunnel_is_open: if True, any host can connect to the job
tracker through the SSH tunnel you open.
Mostly useful if your browser is running on
a different machine from your job runner.
"""
super(EMRJobRunner, self).__init__(**kwargs)
# make aws_region an instance variable; we might want to set it
# based on the scratch bucket
self._aws_region = self._opts['aws_region'] or ''
# if we're going to create a bucket to use as temp space, we don't
# want to actually create it until we run the job (Issue #50).
# This variable helps us create the bucket as needed
self._s3_temp_bucket_to_create = None
self._fix_s3_scratch_and_log_uri_opts()
self._fix_ec2_instance_opts()
# pick a tmp dir based on the job name
self._s3_tmp_uri = self._opts['s3_scratch_uri'] + self._job_name + '/'
# pick/validate output dir
if self._output_dir:
self._output_dir = self._check_and_fix_s3_dir(self._output_dir)
else:
self._output_dir = self._s3_tmp_uri + 'output/'
# add the bootstrap files to a list of files to upload
self._bootstrap_actions = []
for action in self._opts['bootstrap_actions']:
args = shlex.split(action)
if not args:
raise ValueError('bad bootstrap action: %r' % (action,))
# don't use _add_bootstrap_file() because this is a raw bootstrap
# action, not part of mrjob's bootstrap utilities
file_dict = self._add_file(args[0])
file_dict['args'] = args[1:]
self._bootstrap_actions.append(file_dict)
for path in self._opts['bootstrap_files']:
self._add_bootstrap_file(path)
self._bootstrap_scripts = []
for path in self._opts['bootstrap_scripts']:
file_dict = self._add_bootstrap_file(path)
self._bootstrap_scripts.append(file_dict)
self._bootstrap_python_packages = []
for path in self._opts['bootstrap_python_packages']:
name, path = self._split_path(path)
if not path.endswith('.tar.gz'):
raise ValueError(
'bootstrap_python_packages only accepts .tar.gz files!')
file_dict = self._add_bootstrap_file(path)
self._bootstrap_python_packages.append(file_dict)
self._streaming_jar = None
if self._opts.get('hadoop_streaming_jar'):
self._streaming_jar = self._add_file_for_upload(
self._opts['hadoop_streaming_jar'])
if not (isinstance(self._opts['additional_emr_info'], basestring)
or self._opts['additional_emr_info'] is None):
self._opts['additional_emr_info'] = json.dumps(
self._opts['additional_emr_info'])
# if we're bootstrapping mrjob, keep track of the file_dict
# for mrjob.tar.gz
self._mrjob_tar_gz_file = None
# where our own logs ended up (we'll find this out once we run the job)
self._s3_job_log_uri = None
# where to get input from. We'll fill this later. Once filled,
# this must be a list (not some other sort of container)
self._s3_input_uris = None
# we'll create the script later
self._master_bootstrap_script = None
# the ID assigned by EMR to this job (might be None)
self._emr_job_flow_id = self._opts['emr_job_flow_id']
# when did our particular task start?
self._emr_job_start = None
# ssh state
self._ssh_proc = None
self._gave_cant_ssh_warning = False
self._ssh_key_name = None
# cache for SSH address
self._address = None
self._ssh_slave_addrs = None
# store the tracker URL for completion status
self._tracker_url = None
# turn off tracker progress until tunnel is up
self._show_tracker_progress = False
# default requested hadoop version if AMI version is not set
if not (self._opts['ami_version'] or self._opts['hadoop_version']):
self._opts['hadoop_version'] = '0.20'
# init hadoop version cache
self._inferred_hadoop_version = None
@classmethod
def _allowed_opts(cls):
"""A list of which keyword args we can pass to __init__()"""
return super(EMRJobRunner, cls)._allowed_opts() + [
'additional_emr_info',
'ami_version',
'aws_access_key_id',
'aws_availability_zone',
'aws_region',
'aws_secret_access_key',
'bootstrap_actions',
'bootstrap_cmds',
'bootstrap_files',
'bootstrap_python_packages',
'bootstrap_scripts',
'check_emr_status_every',
'ec2_core_instance_bid_price',
'ec2_core_instance_type',
'ec2_instance_type',
'ec2_key_pair',
'ec2_key_pair_file',
'ec2_master_instance_bid_price',
'ec2_master_instance_type',
'ec2_slave_instance_type',
'ec2_task_instance_bid_price',
'ec2_task_instance_type',
'emr_endpoint',
'emr_job_flow_id',
'emr_job_flow_pool_name',
'enable_emr_debugging',
'hadoop_streaming_jar_on_emr',
'hadoop_version',
'num_ec2_core_instances',
'num_ec2_instances',
'num_ec2_task_instances',
'pool_emr_job_flows',
's3_endpoint',
's3_log_uri',
's3_scratch_uri',
's3_sync_wait_time',
'ssh_bin',
'ssh_bind_ports',
'ssh_tunnel_is_open',
'ssh_tunnel_to_job_tracker',
]
@classmethod
def _default_opts(cls):
"""A dictionary giving the default value of options."""
return combine_dicts(super(EMRJobRunner, cls)._default_opts(), {
'check_emr_status_every': 30,
'ec2_core_instance_type': 'm1.small',
'ec2_master_instance_type': 'm1.small',
'emr_job_flow_pool_name': 'default',
'hadoop_version': None, # defaulted in __init__()
'hadoop_streaming_jar_on_emr':
'/home/hadoop/contrib/streaming/hadoop-streaming.jar',
'num_ec2_core_instances': 0,
'num_ec2_instances': 1,
'num_ec2_task_instances': 0,
's3_sync_wait_time': 5.0,
'ssh_bin': ['ssh'],
'ssh_bind_ports': range(40001, 40841),
'ssh_tunnel_to_job_tracker': False,
'ssh_tunnel_is_open': False,
})
@classmethod
def _opts_combiners(cls):
"""Map from option name to a combine_*() function used to combine
values for that option. This allows us to specify that some options
are lists, or contain environment variables, or whatever."""
return combine_dicts(super(EMRJobRunner, cls)._opts_combiners(), {
'bootstrap_actions': combine_lists,
'bootstrap_cmds': combine_lists,
'bootstrap_files': combine_path_lists,
'bootstrap_python_packages': combine_path_lists,
'bootstrap_scripts': combine_path_lists,
'ec2_key_pair_file': combine_paths,
's3_log_uri': combine_paths,
's3_scratch_uri': combine_paths,
'ssh_bin': combine_cmds,
})
def _fix_ec2_instance_opts(self):
"""If the *ec2_instance_type* option is set, override instance
type for the nodes that actually run tasks (see Issue #66). Allow
command-line arguments to override defaults and arguments
in mrjob.conf (see Issue #311).
Also, make sure that core and slave instance type are the same,
total number of instances matches number of master, core, and task
instances, and that bid prices of zero are converted to None.
Helper for __init__.
"""
# Make sure slave and core instance type have the same value
# Within EMRJobRunner we only ever use ec2_core_instance_type,
# but we want ec2_slave_instance_type to be correct in the
# options dictionary.
if (self._opts['ec2_slave_instance_type'] and
(self._opt_priority['ec2_slave_instance_type'] >
self._opt_priority['ec2_core_instance_type'])):
self._opts['ec2_core_instance_type'] = (
self._opts['ec2_slave_instance_type'])
else:
self._opts['ec2_slave_instance_type'] = (
self._opts['ec2_core_instance_type'])
# If task instance type is not set, use core instance type
# (This is mostly so that we don't inadvertently join a pool
# with task instance types with too little memory.)
if not self._opts['ec2_task_instance_type']:
self._opts['ec2_task_instance_type'] = (
self._opts['ec2_core_instance_type'])
# Within EMRJobRunner, we use num_ec2_core_instances and
# num_ec2_task_instances, not num_ec2_instances. (Number
# of master instances is always 1.)
if (self._opt_priority['num_ec2_instances'] >
max(self._opt_priority['num_ec2_core_instances'],
self._opt_priority['num_ec2_task_instances'])):
# assume 1 master, n - 1 core, 0 task
self._opts['num_ec2_core_instances'] = (
self._opts['num_ec2_instances'] - 1)
self._opts['num_ec2_task_instances'] = 0
else:
# issue a warning if we used both kinds of instance number
# options on the command line or in mrjob.conf
if (self._opt_priority['num_ec2_instances'] >= 2 and
self._opt_priority['num_ec2_instances'] <=
max(self._opt_priority['num_ec2_core_instances'],
self._opt_priority['num_ec2_task_instances'])):
log.warn('Mixing num_ec2_instances and'
' num_ec2_{core,task}_instances does not make sense;'
' ignoring num_ec2_instances')
# recalculate number of EC2 instances
self._opts['num_ec2_instances'] = (
1 +
self._opts['num_ec2_core_instances'] +
self._opts['num_ec2_task_instances'])
# Allow ec2 instance type to override other instance types
ec2_instance_type = self._opts['ec2_instance_type']
if ec2_instance_type:
# core (slave) instances
if (self._opt_priority['ec2_instance_type'] >
max(self._opt_priority['ec2_core_instance_type'],
self._opt_priority['ec2_slave_instance_type'])):
self._opts['ec2_core_instance_type'] = ec2_instance_type
self._opts['ec2_slave_instance_type'] = ec2_instance_type
# master instance only does work when it's the only instance
if (self._opts['num_ec2_core_instances'] <= 0 and
self._opts['num_ec2_task_instances'] <= 0 and
(self._opt_priority['ec2_instance_type'] >
self._opt_priority['ec2_master_instance_type'])):
self._opts['ec2_master_instance_type'] = ec2_instance_type
# task instances
if (self._opt_priority['ec2_instance_type'] >
self._opt_priority['ec2_task_instance_type']):
self._opts['ec2_task_instance_type'] = ec2_instance_type
# convert a bid price of '0' to None
for role in ('core', 'master', 'task'):
opt_name = 'ec2_%s_instance_bid_price' % role
if not self._opts[opt_name]:
self._opts[opt_name] = None
else:
# convert "0", "0.00" etc. to None
try:
value = float(self._opts[opt_name])
if value == 0:
self._opts[opt_name] = None
except ValueError:
pass # maybe EMR will accept non-floats?
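# Illustrative examples of the bid price normalization above (hypothetical,
# not part of the original code): '', '0', and '0.00' all become None
# (on-demand instances), while a string like '0.50' is left alone and later
# puts that instance group on the spot market.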
def _fix_s3_scratch_and_log_uri_opts(self):
"""Fill in s3_scratch_uri and s3_log_uri (in self._opts) if they
aren't already set.
Helper for __init__.
"""
s3_conn = self.make_s3_conn()
# check s3_scratch_uri against aws_region if specified
if self._opts['s3_scratch_uri']:
bucket_name, _ = parse_s3_uri(self._opts['s3_scratch_uri'])
bucket_loc = s3_conn.get_bucket(bucket_name).get_location()
# make sure they can communicate if both specified
if (self._aws_region and bucket_loc and
self._aws_region != bucket_loc):
log.warning('warning: aws_region (%s) does not match bucket'
' region (%s). Your EC2 instances may not be able'
' to reach your S3 buckets.' %
(self._aws_region, bucket_loc))
# otherwise derive aws_region from bucket_loc
elif bucket_loc and not self._aws_region:
log.info(
"inferring aws_region from scratch bucket's region (%s)" %
bucket_loc)
self._aws_region = bucket_loc
# set s3_scratch_uri by checking for existing buckets
else:
self._set_s3_scratch_uri(s3_conn)
log.info('using %s as our scratch dir on S3' %
self._opts['s3_scratch_uri'])
self._opts['s3_scratch_uri'] = self._check_and_fix_s3_dir(
self._opts['s3_scratch_uri'])
# set s3_log_uri
if self._opts['s3_log_uri']:
self._opts['s3_log_uri'] = self._check_and_fix_s3_dir(
self._opts['s3_log_uri'])
else:
self._opts['s3_log_uri'] = self._opts['s3_scratch_uri'] + 'logs/'
def _set_s3_scratch_uri(self, s3_conn):
"""Helper for _fix_s3_scratch_and_log_uri_opts"""
buckets = s3_conn.get_all_buckets()
mrjob_buckets = [b for b in buckets if b.name.startswith('mrjob-')]
# Loop over buckets until we find one that is not region-
# restricted, matches aws_region, or can be used to
# infer aws_region if no aws_region is specified
for scratch_bucket in mrjob_buckets:
scratch_bucket_name = scratch_bucket.name
scratch_bucket_location = scratch_bucket.get_location()
if scratch_bucket_location:
if scratch_bucket_location == self._aws_region:
# Regions are both specified and match
log.info("using existing scratch bucket %s" %
scratch_bucket_name)
self._opts['s3_scratch_uri'] = (
's3://%s/tmp/' % scratch_bucket_name)
return
elif not self._aws_region:
# aws_region not specified, so set it based on this
# bucket's location and use this bucket
self._aws_region = scratch_bucket_location
log.info("inferring aws_region from scratch bucket's"
" region (%s)" % self._aws_region)
self._opts['s3_scratch_uri'] = (
's3://%s/tmp/' % scratch_bucket_name)
return
elif scratch_bucket_location != self._aws_region:
continue
elif not self._aws_region:
# Only use regionless buckets if the job flow is regionless
log.info("using existing scratch bucket %s" %
scratch_bucket_name)
self._opts['s3_scratch_uri'] = (
's3://%s/tmp/' % scratch_bucket_name)
return
# That may have all failed. If so, pick a name.
scratch_bucket_name = 'mrjob-%016x' % random.randint(0, 2 ** 64 - 1)
self._s3_temp_bucket_to_create = scratch_bucket_name
log.info("creating new scratch bucket %s" % scratch_bucket_name)
self._opts['s3_scratch_uri'] = 's3://%s/tmp/' % scratch_bucket_name
def _set_s3_job_log_uri(self, job_flow):
"""Given a job flow description, set self._s3_job_log_uri. This allows
us to call self.ls(), etc. without running the job.
"""
log_uri = getattr(job_flow, 'loguri', '')
if log_uri:
self._s3_job_log_uri = '%s%s/' % (
log_uri.replace('s3n://', 's3://'), self._emr_job_flow_id)
def _create_s3_temp_bucket_if_needed(self):
"""Make sure temp bucket exists"""
if self._s3_temp_bucket_to_create:
s3_conn = self.make_s3_conn()
log.info('creating S3 bucket %r to use as scratch space' %
self._s3_temp_bucket_to_create)
location = REGION_TO_S3_LOCATION_CONSTRAINT.get(
self._aws_region, self._aws_region)
s3_conn.create_bucket(self._s3_temp_bucket_to_create,
location=(location or ''))
self._s3_temp_bucket_to_create = None
def _check_and_fix_s3_dir(self, s3_uri):
"""Helper for __init__"""
if not is_s3_uri(s3_uri):
raise ValueError('Invalid S3 URI: %r' % s3_uri)
if not s3_uri.endswith('/'):
s3_uri = s3_uri + '/'
return s3_uri
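# Illustrative examples (hypothetical, not part of the original code):
# 's3://yourbucket/tmp' -> 's3://yourbucket/tmp/', while a non-S3 URI such
# as 'hdfs:///tmp' raises ValueError.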
def _run(self):
self._prepare_for_launch()
self._launch_emr_job()
self._wait_for_job_to_complete()
def _prepare_for_launch(self):
self._setup_input()
self._create_wrapper_script()
self._create_master_bootstrap_script()
self._upload_non_input_files()
def _setup_input(self):
"""Copy local input files (if any) to a special directory on S3.
Set self._s3_input_uris
Helper for _run
"""
self._create_s3_temp_bucket_if_needed()
# winnow out s3 files from local ones
self._s3_input_uris = []
local_input_paths = []
for path in self._input_paths:
if is_s3_uri(path):
# Don't even bother running the job if the input isn't there,
# since it's costly to spin up instances.
if not self.path_exists(path):
raise AssertionError(
'Input path %s does not exist!' % (path,))
self._s3_input_uris.append(path)
else:
local_input_paths.append(path)
# copy local files into an input directory, with names like
# 00000-actual_name.ext
if local_input_paths:
s3_input_dir = self._s3_tmp_uri + 'input/'
log.info('Uploading input to %s' % s3_input_dir)
s3_conn = self.make_s3_conn()
for file_num, path in enumerate(local_input_paths):
if path == '-':
path = self._dump_stdin_to_local_file()
target = '%s%05d-%s' % (
s3_input_dir, file_num, os.path.basename(path))
log.debug('uploading %s -> %s' % (path, target))
s3_key = self.make_s3_key(target, s3_conn)
s3_key.set_contents_from_filename(path)
self._s3_input_uris.append(s3_input_dir)
def _add_bootstrap_file(self, path):
name, path = self._split_path(path)
file_dict = {'path': path, 'name': name, 'bootstrap': 'file'}
self._files.append(file_dict)
return file_dict
def _pick_s3_uris_for_files(self):
"""Decide where each file will be uploaded on S3.
Okay to call this multiple times.
"""
self._assign_unique_names_to_files(
's3_uri', prefix=self._s3_tmp_uri + 'files/',
match=is_s3_uri)
def _upload_non_input_files(self):
"""Copy files to S3
Pick S3 URIs for them if we haven't already."""
self._create_s3_temp_bucket_if_needed()
self._pick_s3_uris_for_files()
s3_files_dir = self._s3_tmp_uri + 'files/'
log.info('Copying non-input files into %s' % s3_files_dir)
s3_conn = self.make_s3_conn()
for file_dict in self._files:
path = file_dict['path']
# don't bother with files that are already on s3
if is_s3_uri(path):
continue
s3_uri = file_dict['s3_uri']
log.debug('uploading %s -> %s' % (path, s3_uri))
s3_key = self.make_s3_key(s3_uri, s3_conn)
s3_key.set_contents_from_filename(file_dict['path'])
def setup_ssh_tunnel_to_job_tracker(self, host):
"""setup the ssh tunnel to the job tracker, if it's not currently
running.
Args:
host -- hostname of the EMR master node.
"""
REQUIRED_OPTS = ['ec2_key_pair', 'ec2_key_pair_file', 'ssh_bind_ports']
for opt_name in REQUIRED_OPTS:
if not self._opts[opt_name]:
if not self._gave_cant_ssh_warning:
log.warning(
"You must set %s in order to ssh to the job tracker!" %
opt_name)
self._gave_cant_ssh_warning = True
return
# if there was already a tunnel, make sure it's still up
if self._ssh_proc:
self._ssh_proc.poll()
if self._ssh_proc.returncode is None:
return
else:
log.warning('Oops, ssh subprocess exited with return code %d,'
' restarting...' % self._ssh_proc.returncode)
self._ssh_proc = None
log.info('Opening ssh tunnel to Hadoop job tracker')
# if ssh detects that a host key has changed, it will silently not
# open the tunnel, so make a fake empty known_hosts file and use that.
# (you can actually use /dev/null as your known hosts file, but
# that's UNIX-specific)
fake_known_hosts_file = os.path.join(
self._get_local_tmp_dir(), 'fake_ssh_known_hosts')
# blank out the file, if it exists
f = open(fake_known_hosts_file, 'w')
f.close()
log.debug('Created empty ssh known-hosts file: %s' % (
fake_known_hosts_file,))
bind_port = None
for bind_port in self._pick_ssh_bind_ports():
args = self._opts['ssh_bin'] + [
'-o', 'VerifyHostKeyDNS=no',
'-o', 'StrictHostKeyChecking=no',
'-o', 'ExitOnForwardFailure=yes',
'-o', 'UserKnownHostsFile=%s' % fake_known_hosts_file,
'-L', '%d:localhost:%d' % (bind_port, EMR_JOB_TRACKER_PORT),
'-N', '-q', # no shell, no output
'-i', self._opts['ec2_key_pair_file'],
]
if self._opts['ssh_tunnel_is_open']:
args.extend(['-g', '-4']) # -4: listen on IPv4 only
args.append('hadoop@' + host)
log.debug('> %s' % cmd_line(args))
ssh_proc = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
time.sleep(WAIT_FOR_SSH_TO_FAIL)
ssh_proc.poll()
# still running. We are golden
if ssh_proc.returncode is None:
self._ssh_proc = ssh_proc
break
if not self._ssh_proc:
log.warning('Failed to open ssh tunnel to job tracker')
else:
if self._opts['ssh_tunnel_is_open']:
bind_host = socket.getfqdn()
else:
bind_host = 'localhost'
self._tracker_url = 'http://%s:%d%s' % (
bind_host, bind_port, EMR_JOB_TRACKER_PATH)
self._show_tracker_progress = True
log.info('Connect to job tracker at: %s' % self._tracker_url)
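# Illustrative sketch (hypothetical hostname and port, not part of the
# original code): with the defaults, the tunnel command built above looks
# roughly like
#
#     ssh -o VerifyHostKeyDNS=no -o StrictHostKeyChecking=no \
#         -o ExitOnForwardFailure=yes -o UserKnownHostsFile=<tmp file> \
#         -L 40001:localhost:9100 -N -q -i <ec2_key_pair_file> hadoop@<master>
#
# after which the job tracker is reachable at
# http://localhost:40001/jobtracker.jsp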
def _pick_ssh_bind_ports(self):
"""Pick a list of ports to try binding our SSH tunnel to.
We will try to bind the same port for any given job flow (Issue #67)
"""
# don't perturb the random number generator
random_state = random.getstate()
try:
# seed random port selection on job flow ID
random.seed(self._emr_job_flow_id)
num_picks = min(MAX_SSH_RETRIES, len(self._opts['ssh_bind_ports']))
return random.sample(self._opts['ssh_bind_ports'], num_picks)
finally:
random.setstate(random_state)
def _enable_slave_ssh_access(self):
if not self._ssh_key_name:
self._ssh_key_name = self._job_name + '.pem'
ssh_copy_key(
self._opts['ssh_bin'],
self._address_of_master(),
self._opts['ec2_key_pair_file'],
self._ssh_key_name)
### Running the job ###
def cleanup(self, mode=None):
super(EMRJobRunner, self).cleanup(mode=mode)
# always stop our SSH tunnel if it's still running
if self._ssh_proc:
self._ssh_proc.poll()
if self._ssh_proc.returncode is None:
log.info('Killing our SSH tunnel (pid %d)' %
self._ssh_proc.pid)
try:
os.kill(self._ssh_proc.pid, signal.SIGKILL)
self._ssh_proc = None
except Exception, e:
log.exception(e)
# stop the job flow if it belongs to us (it may have stopped on its
# own already, but that's fine)
# don't stop it if it was created due to --pool because the user
# probably wants to use it again
if self._emr_job_flow_id and not self._opts['emr_job_flow_id'] \
and not self._opts['pool_emr_job_flows']:
log.info('Terminating job flow: %s' % self._emr_job_flow_id)
try:
self.make_emr_conn().terminate_jobflow(self._emr_job_flow_id)
except Exception, e:
log.exception(e)
def _cleanup_remote_scratch(self):
# delete all the files we created
if self._s3_tmp_uri:
try:
log.info('Removing all files in %s' % self._s3_tmp_uri)
self.rm(self._s3_tmp_uri)
self._s3_tmp_uri = None
except Exception, e:
log.exception(e)
def _cleanup_logs(self):
super(EMRJobRunner, self)._cleanup_logs()
# delete the log files, if it's a job flow we created (the logs
# belong to the job flow)
if self._s3_job_log_uri and not self._opts['emr_job_flow_id'] \
and not self._opts['pool_emr_job_flows']:
try:
log.info('Removing all files in %s' % self._s3_job_log_uri)
self.rm(self._s3_job_log_uri)
self._s3_job_log_uri = None
except Exception, e:
log.exception(e)
def _wait_for_s3_eventual_consistency(self):
"""Sleep for a little while, to give S3 a chance to sync up.
"""
log.info('Waiting %.1fs for S3 eventual consistency' %
self._opts['s3_sync_wait_time'])
time.sleep(self._opts['s3_sync_wait_time'])
def _wait_for_job_flow_termination(self):
try:
jobflow = self._describe_jobflow()
except boto.exception.S3ResponseError:
# mockboto throws this for some reason
return
if (jobflow.keepjobflowalivewhennosteps == 'true' and
jobflow.state == 'WAITING'):
raise Exception('Operation requires job flow to terminate, but'
' it may never do so.')
while jobflow.state not in ('TERMINATED', 'COMPLETED', 'FAILED',
'SHUTTING_DOWN'):
msg = 'Waiting for job flow to terminate (currently %s)' % \
jobflow.state
log.info(msg)
time.sleep(self._opts['check_emr_status_every'])
jobflow = self._describe_jobflow()
def _create_instance_group(self, role, instance_type, count, bid_price):
"""Helper method for creating instance groups. For use when
creating a jobflow using a list of InstanceGroups, instead
of the typical triumvirate of
num_instances/master_instance_type/slave_instance_type.
- role is one of 'MASTER', 'CORE', or 'TASK'
- instance_type is an EC2 instance type
- count is an int
- bid_price is a number, a string, or None. If None,
this instance group will use the ON_DEMAND market
instead of the SPOT market.
"""
if not instance_type:
if self._opts['ec2_instance_type']:
instance_type = self._opts['ec2_instance_type']
else:
raise ValueError('Missing instance type for %s node(s)'
% role)
if bid_price:
market = 'SPOT'
bid_price = str(bid_price) # must be a string
else:
market = 'ON_DEMAND'
bid_price = None
# Just name the groups "master", "task", and "core"
name = role.lower()
return boto_2_1_1_83aae37b.InstanceGroup(
count, role, instance_type, market, name, bidprice=bid_price
)
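# Illustration (hedged sketch; the instance type and bid price are made up):
# a spot CORE group of 4 m1.large instances at $0.20/hr comes out as
#
#   self._create_instance_group('CORE', 'm1.large', 4, '0.20')
#   # -> InstanceGroup(4, 'CORE', 'm1.large', 'SPOT', 'core', bidprice='0.20')
#
# With bid_price=None the market falls back to 'ON_DEMAND' and bidprice stays
# None.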
def _create_job_flow(self, persistent=False, steps=None):
"""Create an empty job flow on EMR, and return the ID of that
job.
persistent -- if this is true, create the job flow with the --alive
option, indicating the job will have to be manually terminated.
"""
# make sure we can see the files we copied to S3
self._wait_for_s3_eventual_consistency()
# figure out local names and S3 URIs for our bootstrap actions, if any
self._name_files()
self._pick_s3_uris_for_files()
log.info('Creating Elastic MapReduce job flow')
args = self._job_flow_args(persistent, steps)
emr_conn = self.make_emr_conn()
log.debug('Calling run_jobflow(%r, %r, %s)' % (
self._job_name, self._opts['s3_log_uri'],
', '.join('%s=%r' % (k, v) for k, v in args.iteritems())))
emr_job_flow_id = emr_conn.run_jobflow(
self._job_name, self._opts['s3_log_uri'], **args)
# keep track of when we started our job
self._emr_job_start = time.time()
log.info('Job flow created with ID: %s' % emr_job_flow_id)
return emr_job_flow_id
def _job_flow_args(self, persistent=False, steps=None):
"""Build kwargs for emr_conn.run_jobflow()"""
args = {}
args['ami_version'] = self._opts['ami_version']
args['hadoop_version'] = self._opts['hadoop_version']
if self._opts['aws_availability_zone']:
args['availability_zone'] = self._opts['aws_availability_zone']
# The old, simple API, available if we're not using task instances
# or bid prices
if not (self._opts['num_ec2_task_instances'] or
self._opts['ec2_core_instance_bid_price'] or
self._opts['ec2_master_instance_bid_price'] or
self._opts['ec2_task_instance_bid_price']):
args['num_instances'] = self._opts['num_ec2_core_instances'] + 1
args['master_instance_type'] = (
self._opts['ec2_master_instance_type'])
args['slave_instance_type'] = self._opts['ec2_core_instance_type']
else:
# Create a list of InstanceGroups
args['instance_groups'] = [
self._create_instance_group(
'MASTER',
self._opts['ec2_master_instance_type'],
1,
self._opts['ec2_master_instance_bid_price']
),
]
if self._opts['num_ec2_core_instances']:
args['instance_groups'].append(
self._create_instance_group(
'CORE',
self._opts['ec2_core_instance_type'],
self._opts['num_ec2_core_instances'],
self._opts['ec2_core_instance_bid_price']
)
)
if self._opts['num_ec2_task_instances']:
args['instance_groups'].append(
self._create_instance_group(
'TASK',
self._opts['ec2_task_instance_type'],
self._opts['num_ec2_task_instances'],
self._opts['ec2_task_instance_bid_price']
)
)
# bootstrap actions
bootstrap_action_args = []
for file_dict in self._bootstrap_actions:
# file_dict is not populated the same way by tools and real job
# runs, so use s3_uri or path as appropriate
s3_uri = file_dict.get('s3_uri', None) or file_dict['path']
bootstrap_action_args.append(
boto.emr.BootstrapAction(
file_dict['name'], s3_uri, file_dict['args']))
if self._master_bootstrap_script:
master_bootstrap_script_args = []
if self._opts['pool_emr_job_flows']:
master_bootstrap_script_args = [
'pool-' + self._pool_hash(),
self._opts['emr_job_flow_pool_name'],
]
bootstrap_action_args.append(
boto.emr.BootstrapAction(
'master', self._master_bootstrap_script['s3_uri'],
master_bootstrap_script_args))
if bootstrap_action_args:
args['bootstrap_actions'] = bootstrap_action_args
if self._opts['ec2_key_pair']:
args['ec2_keyname'] = self._opts['ec2_key_pair']
if self._opts['enable_emr_debugging']:
args['enable_debugging'] = True
if self._opts['additional_emr_info']:
args['additional_info'] = self._opts['additional_emr_info']
if persistent or self._opts['pool_emr_job_flows']:
args['keep_alive'] = True
if steps:
args['steps'] = steps
return args
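# Illustration (hedged sketch; the option values are made up): with no task
# instances and no bid prices, e.g.
#
#   num_ec2_core_instances=2, ec2_master_instance_type='m1.small',
#   ec2_core_instance_type='m1.large'
#
# the simple API is used:
#
#   {'num_instances': 3, 'master_instance_type': 'm1.small',
#    'slave_instance_type': 'm1.large', ...}
#
# Setting any bid price (or requesting task instances) switches to the
# 'instance_groups' form built by _create_instance_group() above.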
def _build_steps(self):
"""Return a list of boto Step objects corresponding to the
steps we want to run."""
assert self._script # can't build steps if no script!
# figure out local names for our files
self._name_files()
self._pick_s3_uris_for_files()
# we're going to instruct EMR to upload the MR script and the
# wrapper script (if any) to the job's local directory
self._script['upload'] = 'file'
if self._wrapper_script:
self._wrapper_script['upload'] = 'file'
# quick, add the other steps before the job spins up and
# then shuts itself down (in practice this takes several minutes)
steps = self._get_steps()
step_list = []
version = self.get_hadoop_version()
for step_num, step in enumerate(steps):
# EMR-specific stuff
name = '%s: Step %d of %d' % (
self._job_name, step_num + 1, len(steps))
# don't terminate other people's job flows
if (self._opts['emr_job_flow_id'] or
self._opts['pool_emr_job_flows']):
action_on_failure = 'CANCEL_AND_WAIT'
else:
action_on_failure = 'TERMINATE_JOB_FLOW'
# Hadoop streaming stuff
if 'M' not in step: # no mapper in this step; fall back to the identity mapper
mapper = 'cat'
else:
mapper = cmd_line(self._mapper_args(step_num))
if 'C' in step:
combiner = cmd_line(self._combiner_args(step_num))
else:
combiner = None
if 'R' in step: # i.e. if there is a reducer:
reducer = cmd_line(self._reducer_args(step_num))
else:
reducer = None
input = self._s3_step_input_uris(step_num)
output = self._s3_step_output_uri(step_num)
step_args, cache_files, cache_archives = self._cache_args()
step_args.extend(self._hadoop_conf_args(step_num, len(steps)))
jar = self._get_jar()
if combiner is not None:
if compat.supports_combiners_in_hadoop_streaming(version):
step_args.extend(['-combiner', combiner])
else:
mapper = "bash -c '%s | sort | %s'" % (mapper, combiner)
streaming_step = boto.emr.StreamingStep(
name=name, mapper=mapper, reducer=reducer,
action_on_failure=action_on_failure,
cache_files=cache_files, cache_archives=cache_archives,
step_args=step_args, input=input, output=output,
jar=jar)
step_list.append(streaming_step)
return step_list
def _cache_args(self):
"""Returns ``(step_args, cache_files, cache_archives)``, populating
each according to the correct behavior for the current Hadoop version.
For < 0.20, populate cache_files and cache_archives.
For >= 0.20, populate step_args.
step_args should be inserted into the step arguments before anything
else.
cache_files and cache_archives should be passed as arguments to
StreamingStep.
"""
version = self.get_hadoop_version()
step_args = []
cache_files = []
cache_archives = []
if compat.supports_new_distributed_cache_options(version):
# boto doesn't support non-deprecated 0.20 options, so insert
# them ourselves
def escaped_paths(file_dicts):
# return list of strings to join with commas and pass to the
# hadoop binary
return ["%s#%s" % (fd['s3_uri'], fd['name'])
for fd in file_dicts]
# index by type
all_files = {}
for fd in self._files:
all_files.setdefault(fd.get('upload'), []).append(fd)
if 'file' in all_files:
step_args.append('-files')
step_args.append(','.join(escaped_paths(all_files['file'])))
if 'archive' in all_files:
step_args.append('-archives')
step_args.append(','.join(escaped_paths(all_files['archive'])))
else:
for file_dict in self._files:
if file_dict.get('upload') == 'file':
cache_files.append(
'%s#%s' % (file_dict['s3_uri'], file_dict['name']))
elif file_dict.get('upload') == 'archive':
cache_archives.append(
'%s#%s' % (file_dict['s3_uri'], file_dict['name']))
return step_args, cache_files, cache_archives
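# Illustration (hedged sketch; the URIs and names are made up): on Hadoop
# >= 0.20, two files marked upload='file' become a single -files argument:
#
#   step_args == ['-files',
#                 's3://walrus/tmp/mr_job.py#mr_job.py,s3://walrus/tmp/wrapper.py#wrapper.py']
#
# while on older Hadoop the same "s3_uri#name" strings are returned in
# cache_files and passed to StreamingStep instead.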
def _get_jar(self):
self._name_files()
self._pick_s3_uris_for_files()
if self._streaming_jar:
return self._streaming_jar['s3_uri']
else:
return self._opts['hadoop_streaming_jar_on_emr']
def _launch_emr_job(self):
"""Create an empty jobflow on EMR, and set self._emr_job_flow_id to
the ID for that job."""
self._create_s3_temp_bucket_if_needed()
# define our steps
steps = self._build_steps()
# try to find a job flow from the pool. basically auto-fill
# 'emr_job_flow_id' if possible and then follow normal behavior.
if self._opts['pool_emr_job_flows']:
job_flow = self.find_job_flow(num_steps=len(steps))
if job_flow:
self._emr_job_flow_id = job_flow.jobflowid
# create a job flow if we're not already using an existing one
if not self._emr_job_flow_id:
self._emr_job_flow_id = self._create_job_flow(
persistent=False, steps=steps)
else:
emr_conn = self.make_emr_conn()
log.info('Adding our job to job flow %s' % self._emr_job_flow_id)
log.debug('Calling add_jobflow_steps(%r, %r)' % (
self._emr_job_flow_id, steps))
emr_conn.add_jobflow_steps(self._emr_job_flow_id, steps)
# keep track of when we launched our job
self._emr_job_start = time.time()
def _wait_for_job_to_complete(self):
"""Wait for the job to complete, and raise an exception if
the job failed.
Also grab log URI from the job status (since we may not know it)
"""
success = False
while True:
# don't antagonize EMR's throttling
log.debug('Waiting %.1f seconds...' %
self._opts['check_emr_status_every'])
time.sleep(self._opts['check_emr_status_every'])
job_flow = self._describe_jobflow()
self._set_s3_job_log_uri(job_flow)
job_state = job_flow.state
reason = getattr(job_flow, 'laststatechangereason', '')
# find all steps belonging to us, and get their state
step_states = []
running_step_name = ''
total_step_time = 0.0
step_nums = [] # step numbers belonging to us. 1-indexed
steps = job_flow.steps or []
for i, step in enumerate(steps):
# ignore steps belonging to other jobs
if not step.name.startswith(self._job_name):
continue
step_nums.append(i + 1)
step.state = step.state
step_states.append(step.state)
if step.state == 'RUNNING':
running_step_name = step.name
if (hasattr(step, 'startdatetime') and
hasattr(step, 'enddatetime')):
start_time = iso8601_to_timestamp(step.startdatetime)
end_time = iso8601_to_timestamp(step.enddatetime)
total_step_time += end_time - start_time
if not step_states:
raise AssertionError("Can't find our steps in the job flow!")
# if all our steps have completed, we're done!
if all(state == 'COMPLETED' for state in step_states):
success = True
break
# if any step fails, give up
if any(state in ('FAILED', 'CANCELLED') for state in step_states):
break
# (the other step states are PENDING and RUNNING)
# keep track of how long we've been waiting
running_time = time.time() - self._emr_job_start
# otherwise, we can print a status message
if running_step_name:
log.info('Job launched %.1fs ago, status %s: %s (%s)' %
(running_time, job_state, reason, running_step_name))
if self._show_tracker_progress:
try:
tracker_handle = urllib2.urlopen(self._tracker_url)
tracker_page = ''.join(tracker_handle.readlines())
tracker_handle.close()
# first two formatted percentages, map then reduce
map_complete, reduce_complete = [float(complete)
for complete in JOB_TRACKER_RE.findall(
tracker_page)[:2]]
log.info(' map %3d%% reduce %3d%%' % (
map_complete, reduce_complete))
except:
log.error('Unable to load progress from job tracker')
# turn off progress for rest of job
self._show_tracker_progress = False
# once a step is running, it's safe to set up the ssh tunnel to
# the job tracker
job_host = getattr(job_flow, 'masterpublicdnsname', None)
if job_host and self._opts['ssh_tunnel_to_job_tracker']:
self.setup_ssh_tunnel_to_job_tracker(job_host)
# other states include STARTING and SHUTTING_DOWN
elif reason:
log.info('Job launched %.1fs ago, status %s: %s' %
(running_time, job_state, reason))
else:
log.info('Job launched %.1fs ago, status %s' %
(running_time, job_state,))
if success:
log.info('Job completed.')
log.info('Running time was %.1fs (not counting time spent waiting'
' for the EC2 instances)' % total_step_time)
self._fetch_counters(step_nums)
self.print_counters(range(1, len(step_nums) + 1))
else:
msg = 'Job failed with status %s: %s' % (job_state, reason)
log.error(msg)
if self._s3_job_log_uri:
log.info('Logs are in %s' % self._s3_job_log_uri)
# look for a Python traceback
cause = self._find_probable_cause_of_failure(step_nums)
if cause:
# log cause, and put it in exception
cause_msg = [] # lines to log and put in exception
cause_msg.append('Probable cause of failure (from %s):' %
cause['log_file_uri'])
cause_msg.extend(line.strip('\n') for line in cause['lines'])
if cause['input_uri']:
cause_msg.append('(while reading from %s)' %
cause['input_uri'])
for line in cause_msg:
log.error(line)
# add cause_msg to exception message
msg += '\n' + '\n'.join(cause_msg) + '\n'
raise Exception(msg)
def _script_args(self):
"""How to invoke the script inside EMR"""
# We can invoke the script by its S3 URL, but we don't really
# gain anything from that, and EMR is touchy about distinguishing
# python scripts from shell scripts
assert self._script # shouldn't call _script_args() if no script
args = self._opts['python_bin'] + [self._script['name']]
if self._wrapper_script:
args = (self._opts['python_bin'] +
[self._wrapper_script['name']] +
args)
return args
def _mapper_args(self, step_num):
return (self._script_args() +
['--step-num=%d' % step_num, '--mapper'] +
self._mr_job_extra_args())
def _reducer_args(self, step_num):
return (self._script_args() +
['--step-num=%d' % step_num, '--reducer'] +
self._mr_job_extra_args())
def _combiner_args(self, step_num):
return (self._script_args() +
['--step-num=%d' % step_num, '--combiner'] +
self._mr_job_extra_args())
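# Illustration (hedged sketch; the script and wrapper names are made up): with
# python_bin=['python'], a job script 'mr_wc.py' and a wrapper 'wrapper.py',
# _script_args() yields ['python', 'wrapper.py', 'python', 'mr_wc.py'], and
# _mapper_args(0) appends ['--step-num=0', '--mapper'] plus any extra job args
# before the whole thing is turned into a command line by cmd_line() in
# _build_steps() above.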
def _upload_args(self):
"""Args to upload files from S3 to the local nodes that EMR runs
on."""
args = []
for file_dict in self._files:
if file_dict.get('upload') == 'file':
args.append('--cache')
args.append('%s#%s' % (file_dict['s3_uri'], file_dict['name']))
elif file_dict.get('upload') == 'archive':
args.append('--cache-archive')
args.append('%s#%s' % (file_dict['s3_uri'], file_dict['name']))
return args
def _s3_step_input_uris(self, step_num):
"""Get the s3:// URIs for input for the given step."""
if step_num == 0:
return self._s3_input_uris
else:
# put intermediate data in HDFS
return ['hdfs:///tmp/mrjob/%s/step-output/%s/' % (
self._job_name, step_num)]
def _s3_step_output_uri(self, step_num):
if step_num == len(self._get_steps()) - 1:
return self._output_dir
else:
# put intermediate data in HDFS
return 'hdfs:///tmp/mrjob/%s/step-output/%s/' % (
self._job_name, step_num + 1)
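# Illustration (hedged sketch; the job name is made up): for a two-step job
# named 'mr_example.alice.20120101.000000.000000', step 0 reads from the
# original S3 input URIs and writes to
#
#   hdfs:///tmp/mrjob/mr_example.alice.20120101.000000.000000/step-output/1/
#
# which is exactly what _s3_step_input_uris(1) returns for step 1; the final
# step writes to self._output_dir.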
### LOG FETCHING/PARSING ###
def _enforce_path_regexp(self, paths, regexp, step_nums=None):
"""Helper for log fetching functions to filter out unwanted
logs. Only pass ``step_nums`` if ``regexp`` has a ``step_nums`` group.
"""
for path in paths:
m = regexp.match(path)
if (m and
(step_nums is None or
int(m.group('step_num')) in step_nums)):
yield path
else:
log.debug('Ignore %s' % path)
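# Illustration (hedged sketch; the regexp and paths are made up, the real ones
# are module-level constants such as STEP_LOG_URI_RE):
#
#   import re
#   fake_re = re.compile(r'.*/steps/(?P<step_num>\d+)/syslog$')
#   paths = ['s3://walrus/logs/steps/1/syslog',
#            's3://walrus/logs/steps/2/syslog',
#            's3://walrus/logs/steps/2/stderr']
#   list(self._enforce_path_regexp(paths, fake_re, step_nums=[2]))
#   # -> ['s3://walrus/logs/steps/2/syslog']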
## SSH LOG FETCHING
def _ls_ssh_logs(self, relative_path):
"""List logs over SSH by path relative to log root directory"""
full_path = SSH_PREFIX + SSH_LOG_ROOT + '/' + relative_path
log.debug('Search %s for logs' % full_path)
return self.ls(full_path)
def _ls_slave_ssh_logs(self, addr, relative_path):
"""List logs over multi-hop SSH by path relative to log root directory
"""
root_path = '%s%s!%s%s' % (SSH_PREFIX,
self._address_of_master(),
addr,
SSH_LOG_ROOT + '/' + relative_path)
log.debug('Search %s for logs' % root_path)
return self.ls(root_path)
def ls_task_attempt_logs_ssh(self, step_nums):
all_paths = []
try:
all_paths.extend(self._ls_ssh_logs('userlogs/'))
except IOError:
# sometimes the master doesn't have these
pass
if not all_paths:
# get them from the slaves instead (takes a little longer)
try:
for addr in self._addresses_of_slaves():
logs = self._ls_slave_ssh_logs(addr, 'userlogs/')
all_paths.extend(logs)
except IOError:
# sometimes the slaves don't have them either
pass
return self._enforce_path_regexp(all_paths,
TASK_ATTEMPTS_LOG_URI_RE,
step_nums)
def ls_step_logs_ssh(self, step_nums):
return self._enforce_path_regexp(self._ls_ssh_logs('steps/'),
STEP_LOG_URI_RE,
step_nums)
def ls_job_logs_ssh(self, step_nums):
return self._enforce_path_regexp(self._ls_ssh_logs('history/'),
EMR_JOB_LOG_URI_RE,
step_nums)
def ls_node_logs_ssh(self):
all_paths = []
for addr in self._addresses_of_slaves():
logs = self._ls_slave_ssh_logs(addr, '')
all_paths.extend(logs)
return self._enforce_path_regexp(all_paths, NODE_LOG_URI_RE)
def ls_all_logs_ssh(self):
"""List all log files in the log root directory"""
return self.ls(SSH_PREFIX + SSH_LOG_ROOT)
## S3 LOG FETCHING ##
def _ls_s3_logs(self, relative_path):
"""List logs over S3 by path relative to log root directory"""
if not self._s3_job_log_uri:
self._set_s3_job_log_uri(self._describe_jobflow())
if not self._s3_job_log_uri:
raise LogFetchError('Could not determine S3 job log URI')
full_path = self._s3_job_log_uri + relative_path
log.debug('Search %s for logs' % full_path)
return self.ls(full_path)
def ls_task_attempt_logs_s3(self, step_nums):
return self._enforce_path_regexp(self._ls_s3_logs('task-attempts/'),
TASK_ATTEMPTS_LOG_URI_RE,
step_nums)
def ls_step_logs_s3(self, step_nums):
return self._enforce_path_regexp(self._ls_s3_logs('steps/'),
STEP_LOG_URI_RE,
step_nums)
def ls_job_logs_s3(self, step_nums):
return self._enforce_path_regexp(self._ls_s3_logs('jobs/'),
EMR_JOB_LOG_URI_RE,
step_nums)
def ls_node_logs_s3(self):
return self._enforce_path_regexp(self._ls_s3_logs('node/'),
NODE_LOG_URI_RE)
def ls_all_logs_s3(self):
"""List all log files in the S3 log root directory"""
if not self._s3_job_log_uri:
self._set_s3_job_log_uri(self._describe_jobflow())
return self.ls(self._s3_job_log_uri)
## LOG PARSING ##
def _fetch_counters(self, step_nums, skip_s3_wait=False):
"""Read Hadoop counters from S3.
Args:
step_nums -- the steps belonging to us, so that we can ignore counters
from other jobs when sharing a job flow
"""
self._counters = []
new_counters = {}
if self._opts['ec2_key_pair_file']:
try:
new_counters = self._fetch_counters_ssh(step_nums)
except LogFetchError:
new_counters = self._fetch_counters_s3(step_nums, skip_s3_wait)
except IOError:
# Can get 'file not found' if test suite was lazy or Hadoop
# logs moved. We shouldn't crash in either case.
new_counters = self._fetch_counters_s3(step_nums, skip_s3_wait)
else:
log.info('ec2_key_pair_file not specified, going to S3')
new_counters = self._fetch_counters_s3(step_nums, skip_s3_wait)
# step_nums is relative to the start of the job flow
# we only want them relative to the job
for step_num in step_nums:
self._counters.append(new_counters.get(step_num, {}))
def _fetch_counters_ssh(self, step_nums):
uris = list(self.ls_job_logs_ssh(step_nums))
log.info('Fetching counters from SSH...')
return scan_for_counters_in_files(uris, self,
self.get_hadoop_version())
def _fetch_counters_s3(self, step_nums, skip_s3_wait=False):
job_flow = self._describe_jobflow()
if job_flow.keepjobflowalivewhennosteps == 'true':
log.info("Can't fetch counters from S3 for five more minutes. Try"
" 'python -m mrjob.tools.emr.fetch_logs --counters %s'"
" in five minutes." % job_flow.jobflowid)
return {}
log.info('Fetching counters from S3...')
if not skip_s3_wait:
self._wait_for_s3_eventual_consistency()
self._wait_for_job_flow_termination()
try:
uris = self.ls_job_logs_s3(step_nums)
return scan_for_counters_in_files(uris, self,
self.get_hadoop_version())
except LogFetchError, e:
log.info("Unable to fetch counters: %s" % e)
return {}
def counters(self):
return self._counters
def _find_probable_cause_of_failure(self, step_nums):
"""Scan logs for Python exception tracebacks.
Args:
step_nums -- the numbers of steps belonging to us, so that we
can ignore errors from other jobs when sharing a job flow
Returns:
None (nothing found) or a dictionary containing:
lines -- lines in the log file containing the error message
log_file_uri -- the log file containing the error message
input_uri -- if the error happened in a mapper in the first
step, the URI of the input file that caused the error
(otherwise None)
"""
if self._opts['ec2_key_pair_file']:
try:
return self._find_probable_cause_of_failure_ssh(step_nums)
except LogFetchError:
return self._find_probable_cause_of_failure_s3(step_nums)
else:
log.info('ec2_key_pair_file not specified, going to S3')
return self._find_probable_cause_of_failure_s3(step_nums)
def _find_probable_cause_of_failure_ssh(self, step_nums):
task_attempt_logs = self.ls_task_attempt_logs_ssh(step_nums)
step_logs = self.ls_step_logs_ssh(step_nums)
job_logs = self.ls_job_logs_ssh(step_nums)
log.info('Scanning SSH logs for probable cause of failure')
return scan_logs_in_order(task_attempt_logs=task_attempt_logs,
step_logs=step_logs,
job_logs=job_logs,
runner=self)
def _find_probable_cause_of_failure_s3(self, step_nums):
log.info('Scanning S3 logs for probable cause of failure')
self._wait_for_s3_eventual_consistency()
self._wait_for_job_flow_termination()
task_attempt_logs = self.ls_task_attempt_logs_s3(step_nums)
step_logs = self.ls_step_logs_s3(step_nums)
job_logs = self.ls_job_logs_s3(step_nums)
return scan_logs_in_order(task_attempt_logs=task_attempt_logs,
step_logs=step_logs,
job_logs=job_logs,
runner=self)
### Bootstrapping ###
def _create_master_bootstrap_script(self, dest='b.py'):
"""Create the master bootstrap script and write it into our local
temp directory.
This will do nothing if there are no bootstrap scripts or commands,
or if _create_master_bootstrap_script() has already been called."""
# we call the script b.py because there's a character limit on
# bootstrap script names (or there was at one time, anyway)
if not any(key.startswith('bootstrap_') and value
for (key, value) in self._opts.iteritems()):
return
# don't bother if we're not starting a job flow
if self._opts['emr_job_flow_id']:
return
# Also don't bother if we're not pooling (and therefore don't need
# to have a bootstrap script to attach to) and we're not bootstrapping
# anything else
if not (self._opts['pool_emr_job_flows'] or
any(key.startswith('bootstrap_') and
key != 'bootstrap_actions' and # these are separate scripts
value
for (key, value) in self._opts.iteritems())):
return
if self._opts['bootstrap_mrjob']:
if self._mrjob_tar_gz_file is None:
self._mrjob_tar_gz_file = self._add_bootstrap_file(
self._create_mrjob_tar_gz() + '#')
# need to know what files are called
self._name_files()
self._pick_s3_uris_for_files()
path = os.path.join(self._get_local_tmp_dir(), dest)
log.info('writing master bootstrap script to %s' % path)
contents = self._master_bootstrap_script_content()
for line in StringIO(contents):
log.debug('BOOTSTRAP: ' + line.rstrip('\r\n'))
f = open(path, 'w')
f.write(contents)
f.close()
name, _ = self._split_path(path)
self._master_bootstrap_script = {'path': path, 'name': name}
self._files.append(self._master_bootstrap_script)
def _master_bootstrap_script_content(self):
"""Create the contents of the master bootstrap script.
This function does NOT name files, pick S3 URIs for them, or anything like
that; _create_master_bootstrap_script() is responsible for that, and gives
names and S3 URIs to any files that don't already have them.
"""
out = StringIO()
python_bin_in_list = ', '.join(repr(opt) for opt in self._opts['python_bin'])
def writeln(line=''):
out.write(line + '\n')
# shebang
writeln('#!/usr/bin/python')
writeln()
# imports
writeln('from __future__ import with_statement')
writeln()
writeln('import distutils.sysconfig')
writeln('import os')
writeln('import stat')
writeln('from subprocess import call, check_call')
writeln('from tempfile import mkstemp')
writeln('from xml.etree.ElementTree import ElementTree')
writeln()
# download files using hadoop fs
writeln('# download files using hadoop fs -copyToLocal')
for file_dict in self._files:
if file_dict.get('bootstrap'):
writeln(
"check_call(['hadoop', 'fs', '-copyToLocal', %r, %r])" %
(file_dict['s3_uri'], file_dict['name']))
writeln()
# make scripts executable
if self._bootstrap_scripts:
writeln('# make bootstrap scripts executable')
for file_dict in self._bootstrap_scripts:
writeln("check_call(['chmod', 'a+rx', %r])" %
file_dict['name'])
writeln()
# bootstrap mrjob
if self._opts['bootstrap_mrjob']:
writeln('# bootstrap mrjob')
writeln("site_packages = distutils.sysconfig.get_python_lib()")
writeln(
"check_call(['sudo', 'tar', 'xfz', %r, '-C', site_packages])" %
self._mrjob_tar_gz_file['name'])
# re-compile pyc files now, since mappers/reducers can't
# write to this directory. Don't fail if there is extra
# un-compileable crud in the tarball.
writeln("mrjob_dir = os.path.join(site_packages, 'mrjob')")
writeln("call(["
"'sudo', %s, '-m', 'compileall', '-f', mrjob_dir])" %
python_bin_in_list)
writeln()
# install our python modules
if self._bootstrap_python_packages:
writeln('# install python modules:')
for file_dict in self._bootstrap_python_packages:
writeln("check_call(['tar', 'xfz', %r])" %
file_dict['name'])
# figure out name of dir to CD into
assert file_dict['path'].endswith('.tar.gz')
cd_into = extract_dir_for_tar(file_dict['path'])
# install the module
writeln("check_call(["
"'sudo', %s, 'setup.py', 'install'], cwd=%r)" %
(python_bin_in_list, cd_into))
# run our commands
if self._opts['bootstrap_cmds']:
writeln('# run bootstrap cmds:')
for cmd in self._opts['bootstrap_cmds']:
if isinstance(cmd, basestring):
writeln('check_call(%r, shell=True)' % cmd)
else:
writeln('check_call(%r)' % cmd)
writeln()
# run our scripts
if self._bootstrap_scripts:
writeln('# run bootstrap scripts:')
for file_dict in self._bootstrap_scripts:
writeln('check_call(%r)' % (
['./' + file_dict['name']],))
writeln()
return out.getvalue()
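# Illustration (hedged sketch; the URIs and file names are made up): for a job
# that only bootstraps mrjob with python_bin=['python'], the generated script
# looks roughly like
#
#   #!/usr/bin/python
#   from __future__ import with_statement
#   ...
#   # download files using hadoop fs -copyToLocal
#   check_call(['hadoop', 'fs', '-copyToLocal',
#               's3://walrus/tmp/mrjob.tar.gz', 'mrjob.tar.gz'])
#   # bootstrap mrjob
#   site_packages = distutils.sysconfig.get_python_lib()
#   check_call(['sudo', 'tar', 'xfz', 'mrjob.tar.gz', '-C', site_packages])
#   mrjob_dir = os.path.join(site_packages, 'mrjob')
#   call(['sudo', 'python', '-m', 'compileall', '-f', mrjob_dir])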
### EMR JOB MANAGEMENT UTILS ###
def make_persistent_job_flow(self):
"""Create a new EMR job flow that requires manual termination, and
return its ID.
You can also fetch the job ID by calling self.get_emr_job_flow_id()
"""
if (self._emr_job_flow_id):
raise AssertionError(
'This runner is already associated with job flow ID %s' %
(self._emr_job_flow_id))
log.info('Creating persistent job flow to run several jobs in...')
self._create_master_bootstrap_script()
self._upload_non_input_files()
# don't allow user to call run()
self._ran_job = True
self._emr_job_flow_id = self._create_job_flow(persistent=True)
return self._emr_job_flow_id
def get_emr_job_flow_id(self):
return self._emr_job_flow_id
def usable_job_flows(self, emr_conn=None, exclude=None, num_steps=1):
"""Get job flows that this runner can use.
We basically expect to only join available job flows with the exact
same setup as our own, that is:
- same bootstrap setup (including mrjob version)
- same Hadoop and AMI versions
- same number and type of instances
However, we allow joining job flows where for each role, every instance
has at least as much memory as we require, and the total number of
compute units is at least what we require.
There also must be room for our job in the job flow (job flows top out
at 256 steps).
We then sort by:
- total compute units for core + task nodes
- total compute units for master node
- time left to an even instance hour
The most desirable job flows come *last* in the list.
:return: list of (job_minutes_float,
:py:class:`boto.emr.emrobject.JobFlow`)
"""
emr_conn = emr_conn or self.make_emr_conn()
exclude = exclude or set()
req_hash = self._pool_hash()
# decide memory and total compute units requested for each
# role type
role_to_req_instance_type = {}
role_to_req_num_instances = {}
role_to_req_mem = {}
role_to_req_cu = {}
role_to_req_bid_price = {}
for role in ('core', 'master', 'task'):
instance_type = self._opts['ec2_%s_instance_type' % role]
if role == 'master':
num_instances = 1
else:
num_instances = self._opts['num_ec2_%s_instances' % role]
role_to_req_instance_type[role] = instance_type
role_to_req_num_instances[role] = num_instances
role_to_req_bid_price[role] = (
self._opts['ec2_%s_instance_bid_price' % role])
# unknown instance types can only match themselves
role_to_req_mem[role] = (
EC2_INSTANCE_TYPE_TO_MEMORY.get(instance_type, float('Inf')))
role_to_req_cu[role] = (
num_instances *
EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS.get(instance_type,
float('Inf')))
sort_keys_and_job_flows = []
def add_if_match(job_flow):
# this may be a retry due to locked job flows
if job_flow.jobflowid in exclude:
return
# only take persistent job flows
if job_flow.keepjobflowalivewhennosteps != 'true':
return
# match pool name, and (bootstrap) hash
hash, name = pool_hash_and_name(job_flow)
if req_hash != hash:
return
if self._opts['emr_job_flow_pool_name'] != name:
return
# match hadoop version
if job_flow.hadoopversion != self.get_hadoop_version():
return
# match AMI version
job_flow_ami_version = getattr(job_flow, 'amiversion', None)
if job_flow_ami_version != self._opts['ami_version']:
return
# there is a hard limit of 256 steps per job flow
if len(job_flow.steps) + num_steps > MAX_STEPS_PER_JOB_FLOW:
return
# in rare cases, job flow can be WAITING *and* have incomplete
# steps
if any(getattr(step, 'enddatetime', None) is None
for step in job_flow.steps):
return
# total compute units per group
role_to_cu = defaultdict(float)
# total number of instances of the same type in each group.
# This allows us to match unknown instance types.
role_to_matched_instances = defaultdict(int)
# check memory and compute units, bailing out if we hit
# an instance with too little memory
for ig in job_flow.instancegroups:
role = ig.instancerole.lower()
# unknown, new kind of role; bail out!
if role not in ('core', 'master', 'task'):
return
req_instance_type = role_to_req_instance_type[role]
if ig.instancetype != req_instance_type:
# if too little memory, bail out
mem = EC2_INSTANCE_TYPE_TO_MEMORY.get(ig.instancetype, 0.0)
req_mem = role_to_req_mem.get(role, 0.0)
if mem < req_mem:
return
# if bid price is too low, don't count compute units
req_bid_price = role_to_req_bid_price[role]
bid_price = getattr(ig, 'bidprice', None)
# if the instance is on-demand (no bid price) or bid prices
# are the same, we're okay
if bid_price and bid_price != req_bid_price:
# whoops, we didn't want spot instances at all
if not req_bid_price:
continue
try:
if float(req_bid_price) > float(bid_price):
continue
except ValueError:
# we don't know what to do with non-float bid prices,
# and we know it's not equal to what we requested
continue
# don't require instances to be running; we'd be worse off if
# we started our own job flow from scratch. (This can happen if
# the previous job finished while some task instances were
# still being provisioned.)
cu = (int(ig.instancerequestcount) *
EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS.get(
ig.instancetype, 0.0))
role_to_cu.setdefault(role, 0.0)
role_to_cu[role] += cu
# track number of instances of the same type
if ig.instancetype == req_instance_type:
role_to_matched_instances[role] += (
int(ig.instancerequestcount))
# check if there are enough compute units
for role, req_cu in role_to_req_cu.iteritems():
req_num_instances = role_to_req_num_instances[role]
# if we have at least as many units of the right type,
# don't bother counting compute units
if req_num_instances > role_to_matched_instances[role]:
cu = role_to_cu.get(role, 0.0)
if cu < req_cu:
return
# make a sort key
sort_key = (role_to_cu['core'] + role_to_cu['task'],
role_to_cu['master'],
est_time_to_hour(job_flow))
sort_keys_and_job_flows.append((sort_key, job_flow))
for job_flow in emr_conn.describe_jobflows(states=['WAITING']):
add_if_match(job_flow)
return [job_flow for (sort_key, job_flow)
in sorted(sort_keys_and_job_flows)]
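# Illustration (hedged sketch; the compute-unit figures are made up): sort keys
# are compared as tuples, so a flow offering (8.0 core+task CU, 1.0 master CU,
# 12.3 minutes to the next full instance hour) sorts after one offering
# (4.0, 1.0, 55.0) and is therefore preferred by find_job_flow() below, which
# takes the last element of the sorted list.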
def find_job_flow(self, num_steps=1):
"""Find a job flow that can host this runner. Prefer flows with more
compute units. Break ties by choosing flow with longest idle time.
Return ``None`` if no suitable flows exist.
"""
chosen_job_flow = None
exclude = set()
emr_conn = self.make_emr_conn()
s3_conn = self.make_s3_conn()
while chosen_job_flow is None:
sorted_tagged_job_flows = self.usable_job_flows(
emr_conn=emr_conn,
exclude=exclude,
num_steps=num_steps)
if sorted_tagged_job_flows:
job_flow = sorted_tagged_job_flows[-1]
status = attempt_to_acquire_lock(
s3_conn, self._lock_uri(job_flow),
self._opts['s3_sync_wait_time'], self._job_name)
if status:
return sorted_tagged_job_flows[-1]
else:
exclude.add(job_flow.jobflowid)
else:
return None
def _lock_uri(self, job_flow):
return make_lock_uri(self._opts['s3_scratch_uri'],
job_flow.jobflowid,
len(job_flow.steps) + 1)
def _pool_hash(self):
"""Generate a hash of the bootstrap configuration so it can be used to
match jobs and job flows. The first argument passed to the bootstrap
script will be ``'pool-'`` plus this hash.
"""
def should_include_file(info):
# Bootstrap scripts will always have a different checksum
if 'name' in info and info['name'] in ('b.py', 'wrapper.py'):
return False
# Also do not include script used to spin up job
if self._script and info['path'] == self._script['path']:
return False
# Only include bootstrap files
if 'bootstrap' not in info:
return False
# mrjob.tar.gz is covered by the bootstrap_mrjob variable.
# also, it seems to be different every time, causing an
# undesirable hash mismatch.
if (self._opts['bootstrap_mrjob']
and info is self._mrjob_tar_gz_file):
return False
# Ignore job-specific files
if info['path'] in self._input_paths:
return False
return True
# strip unique s3 URI if there is one
cleaned_bootstrap_actions = [dict(path=fd['path'], args=fd['args'])
for fd in self._bootstrap_actions]
things_to_hash = [
[self.md5sum(fd['path'])
for fd in self._files if should_include_file(fd)],
self._opts['additional_emr_info'],
self._opts['bootstrap_mrjob'],
self._opts['bootstrap_cmds'],
cleaned_bootstrap_actions,
]
if self._opts['bootstrap_mrjob']:
things_to_hash.append(mrjob.__version__)
return hash_object(things_to_hash)
### GENERAL FILESYSTEM STUFF ###
def du(self, path_glob):
"""Get the size of all files matching path_glob."""
if not is_s3_uri(path_glob):
return super(EMRJobRunner, self).du(path_glob)
return sum(self.get_s3_key(uri).size for uri in self.ls(path_glob))
def ls(self, path_glob):
"""Recursively list files locally or on S3.
This doesn't list "directories" unless there's actually a
corresponding key ending with a '/' (which is weird and confusing;
don't make S3 keys ending in '/')
To list a directory, path_glob must end with a trailing
slash (foo and foo/ are different on S3)
"""
if SSH_URI_RE.match(path_glob):
for item in self._ssh_ls(path_glob):
yield item
return
if not is_s3_uri(path_glob):
for path in super(EMRJobRunner, self).ls(path_glob):
yield path
return
# support globs
glob_match = GLOB_RE.match(path_glob)
# if it's a "file" (doesn't end with /), just check if it exists
if not glob_match and not path_glob.endswith('/'):
uri = path_glob
if self.get_s3_key(uri):
yield uri
return
# we're going to search for all keys starting with base_uri
if glob_match:
# cut it off at first wildcard
base_uri = glob_match.group(1)
else:
base_uri = path_glob
for uri in self._s3_ls(base_uri):
# enforce globbing
if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
continue
yield uri
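# Illustration (hedged sketch; the bucket contents are made up): with keys
# s3://walrus/data/part-00000 and s3://walrus/data/part-00001,
#
#   list(self.ls('s3://walrus/data/'))            # everything under data/
#   list(self.ls('s3://walrus/data/part-0000*'))  # GLOB_RE cuts the URI at the
#                                                 # first wildcard, then
#                                                 # fnmatchcase() filters the
#                                                 # listing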
def _ssh_ls(self, uri):
"""Helper for ls(); obeys globbing"""
m = SSH_URI_RE.match(uri)
try:
addr = m.group('hostname') or self._address_of_master()
if '!' in addr:
self._enable_slave_ssh_access()
output = ssh_ls(
self._opts['ssh_bin'],
addr,
self._opts['ec2_key_pair_file'],
m.group('filesystem_path'),
self._ssh_key_name,
)
for line in output:
# skip directories, we only want to return downloadable files
if line and not line.endswith('/'):
yield SSH_PREFIX + addr + line
except SSHException, e:
raise LogFetchError(e)
def _s3_ls(self, uri):
"""Helper for ls(); doesn't bother with globbing or directories"""
s3_conn = self.make_s3_conn()
bucket_name, key_name = parse_s3_uri(uri)
bucket = s3_conn.get_bucket(bucket_name)
for key in bucket.list(key_name):
yield s3_key_to_uri(key)
def md5sum(self, path, s3_conn=None):
if is_s3_uri(path):
k = self.get_s3_key(path, s3_conn=s3_conn)
return k.etag.strip('"')
else:
return super(EMRJobRunner, self).md5sum(path)
def _cat_file(self, filename):
ssh_match = SSH_URI_RE.match(filename)
if is_s3_uri(filename):
# stream lines from the s3 key
s3_key = self.get_s3_key(filename)
buffer_iterator = read_file(s3_key_to_uri(s3_key), fileobj=s3_key)
return buffer_iterator_to_line_iterator(buffer_iterator)
elif ssh_match:
try:
addr = ssh_match.group('hostname') or self._address_of_master()
if '!' in addr:
self._enable_slave_ssh_access()
output = ssh_cat(
self._opts['ssh_bin'],
addr,
self._opts['ec2_key_pair_file'],
ssh_match.group('filesystem_path'),
self._ssh_key_name,
)
return read_file(filename, fileobj=StringIO(output))
except SSHException, e:
raise LogFetchError(e)
else:
# read from local filesystem
return super(EMRJobRunner, self)._cat_file(filename)
def mkdir(self, dest):
"""Make a directory. This does nothing on S3 because there are
no directories.
"""
if not is_s3_uri(dest):
super(EMRJobRunner, self).mkdir(dest)
def path_exists(self, path_glob):
"""Does the given path exist?
If dest is a directory (ends with a "/"), we check if there are
any files starting with that path.
"""
if not is_s3_uri(path_glob):
return super(EMRJobRunner, self).path_exists(path_glob)
# just fall back on ls(); it's smart
return any(self.ls(path_glob))
def path_join(self, dirname, filename):
if is_s3_uri(dirname):
return posixpath.join(dirname, filename)
else:
return os.path.join(dirname, filename)
def rm(self, path_glob):
"""Remove all files matching the given glob."""
if not is_s3_uri(path_glob):
return super(EMRJobRunner, self).rm(path_glob)
s3_conn = self.make_s3_conn()
for uri in self.ls(path_glob):
key = self.get_s3_key(uri, s3_conn)
if key:
log.debug('deleting ' + uri)
key.delete()
# special case: when deleting a directory, also clean up
# the _$folder$ files that EMR creates.
if uri.endswith('/'):
folder_uri = uri[:-1] + '_$folder$'
folder_key = self.get_s3_key(folder_uri, s3_conn)
if folder_key:
log.debug('deleting ' + folder_uri)
folder_key.delete()
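# Illustration (hedged sketch; the URIs are made up): rm('s3://walrus/tmp/output/')
# deletes every key under that prefix; if one of the listed keys is itself a
# directory marker such as 's3://walrus/tmp/output/logs/', the companion
# 's3://walrus/tmp/output/logs_$folder$' key is deleted along with it.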
def touchz(self, dest):
"""Make an empty file in the given location. Raises an error if
a non-empty file already exists in that location."""
if not is_s3_uri(dest):
return super(EMRJobRunner, self).touchz(dest)
key = self.get_s3_key(dest)
if key and key.size != 0:
raise OSError('Non-empty file %r already exists!' % (dest,))
self.make_s3_key(dest).set_contents_from_string('')
### EMR-specific STUFF ###
def _wrap_aws_conn(self, raw_conn):
"""Wrap a given boto Connection object so that it can retry when
throttled."""
def retry_if(ex):
"""Retry if we get a server error indicating throttling. Also
handle spurious 505s that are thought to be part of a load
balancer issue inside AWS."""
return ((isinstance(ex, boto.exception.BotoServerError) and
('Throttling' in ex.body or
'RequestExpired' in ex.body or
ex.status == 505)) or
(isinstance(ex, socket.error) and
ex.args in ((104, 'Connection reset by peer'),
(110, 'Connection timed out'))))
return RetryWrapper(raw_conn,
retry_if=retry_if,
backoff=EMR_BACKOFF,
multiplier=EMR_BACKOFF_MULTIPLIER,
max_tries=EMR_MAX_TRIES)
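# Illustration (hedged sketch; EMR_BACKOFF, EMR_BACKOFF_MULTIPLIER and
# EMR_MAX_TRIES are module constants defined earlier in this file): a call that
# raises a throttling error is retried by RetryWrapper, pausing between
# attempts (the backoff/multiplier arguments above control how the pause
# grows), while any exception for which retry_if() returns False propagates
# immediately.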
def make_emr_conn(self):
"""Create a connection to EMR.
:return: a :py:class:`mrjob.boto_2_1_1_83aae37b.EmrConnection`, a
subclass of :py:class:`boto.emr.connection.EmrConnection`,
wrapped in a :py:class:`mrjob.retry.RetryWrapper`
"""
# ...which is then wrapped in bacon! Mmmmm!
# give a non-cryptic error message if boto isn't installed
if boto is None:
raise ImportError('You must install boto to connect to EMR')
region = self._get_region_info_for_emr_conn()
log.debug('creating EMR connection (to %s)' % region.endpoint)
raw_emr_conn = boto_2_1_1_83aae37b.EmrConnection(
aws_access_key_id=self._opts['aws_access_key_id'],
aws_secret_access_key=self._opts['aws_secret_access_key'],
region=region)
return self._wrap_aws_conn(raw_emr_conn)
def _get_region_info_for_emr_conn(self):
"""Get a :py:class:`boto.ec2.regioninfo.RegionInfo` object to
initialize EMR connections with.
This is kind of silly because all
:py:class:`boto.emr.connection.EmrConnection` ever does with
this object is extract the hostname, but that's how boto rolls.
"""
if self._opts['emr_endpoint']:
endpoint = self._opts['emr_endpoint']
else:
# look up endpoint in our table
try:
endpoint = REGION_TO_EMR_ENDPOINT[self._aws_region]
except KeyError:
raise Exception(
"Don't know the EMR endpoint for %s;"
" try setting emr_endpoint explicitly" % self._aws_region)
return boto.ec2.regioninfo.RegionInfo(None, self._aws_region, endpoint)
def _describe_jobflow(self, emr_conn=None):
emr_conn = emr_conn or self.make_emr_conn()
return emr_conn.describe_jobflow(self._emr_job_flow_id)
def get_hadoop_version(self):
if not self._inferred_hadoop_version:
if self._emr_job_flow_id:
# if joining a job flow, infer the version
self._inferred_hadoop_version = (
self._describe_jobflow().hadoopversion)
else:
# otherwise, read it from hadoop_version/ami_version
hadoop_version = self._opts['hadoop_version']
if hadoop_version:
self._inferred_hadoop_version = hadoop_version
else:
ami_version = self._opts['ami_version']
# don't explode if we see an AMI version that's
# newer than what we know about.
self._inferred_hadoop_version = (
AMI_VERSION_TO_HADOOP_VERSION.get(ami_version) or
AMI_VERSION_TO_HADOOP_VERSION['latest'])
return self._inferred_hadoop_version
def _address_of_master(self, emr_conn=None):
"""Get the address of the master node so we can SSH to it"""
# cache address of master to avoid redundant calls to describe_jobflow
# also convenient for testing (pretend we can SSH when we really can't
# by setting this to something not False)
if self._address:
return self._address
try:
jobflow = self._describe_jobflow(emr_conn)
if jobflow.state not in ('WAITING', 'RUNNING'):
raise LogFetchError(
'Cannot ssh to master; job flow is not waiting or running')
except boto.exception.S3ResponseError:
# This error is raised by mockboto when the jobflow doesn't exist
raise LogFetchError('Could not get job flow information')
self._address = jobflow.masterpublicdnsname
return self._address
def _addresses_of_slaves(self):
if not self._ssh_slave_addrs:
self._ssh_slave_addrs = ssh_slave_addresses(
self._opts['ssh_bin'],
self._address_of_master(),
self._opts['ec2_key_pair_file'])
return self._ssh_slave_addrs
### S3-specific FILESYSTEM STUFF ###
# Utilities for interacting with S3 using S3 URIs.
# Try to use the more general filesystem interface unless you really
# need to do something S3-specific (e.g. setting file permissions)
def make_s3_conn(self):
"""Create a connection to S3.
:return: a :py:class:`boto.s3.connection.S3Connection`, wrapped in a
:py:class:`mrjob.retry.RetryWrapper`
"""
# give a non-cryptic error message if boto isn't installed
if boto is None:
raise ImportError('You must install boto to connect to S3')
s3_endpoint = self._get_s3_endpoint()
log.debug('creating S3 connection (to %s)' % s3_endpoint)
raw_s3_conn = boto.connect_s3(
aws_access_key_id=self._opts['aws_access_key_id'],
aws_secret_access_key=self._opts['aws_secret_access_key'],
host=s3_endpoint)
return self._wrap_aws_conn(raw_s3_conn)
def _get_s3_endpoint(self):
if self._opts['s3_endpoint']:
return self._opts['s3_endpoint']
else:
# look it up in our table
try:
return REGION_TO_S3_ENDPOINT[self._aws_region]
except KeyError:
raise Exception(
"Don't know the S3 endpoint for %s;"
" try setting s3_endpoint explicitly" % self._aws_region)
def get_s3_key(self, uri, s3_conn=None):
"""Get the boto Key object matching the given S3 uri, or
return None if that key doesn't exist.
uri is an S3 URI: ``s3://foo/bar``
You may optionally pass in an existing s3 connection through
``s3_conn``.
"""
if not s3_conn:
s3_conn = self.make_s3_conn()
bucket_name, key_name = parse_s3_uri(uri)
return s3_conn.get_bucket(bucket_name).get_key(key_name)
def make_s3_key(self, uri, s3_conn=None):
"""Create the given S3 key, and return the corresponding
boto Key object.
uri is an S3 URI: ``s3://foo/bar``
You may optionally pass in an existing S3 connection through
``s3_conn``.
"""
if not s3_conn:
s3_conn = self.make_s3_conn()
bucket_name, key_name = parse_s3_uri(uri)
return s3_conn.get_bucket(bucket_name).new_key(key_name)
def get_s3_keys(self, uri, s3_conn=None):
"""Get a stream of boto Key objects for each key inside
the given dir on S3.
uri is an S3 URI: ``s3://foo/bar``
You may optionally pass in an existing S3 connection through s3_conn
"""
if not s3_conn:
s3_conn = self.make_s3_conn()
bucket_name, key_prefix = parse_s3_uri(uri)
bucket = s3_conn.get_bucket(bucket_name)
for key in bucket.list(key_prefix):
yield key
def get_s3_folder_keys(self, uri, s3_conn=None):
"""Background: S3 is even less of a filesystem than HDFS in that it
doesn't have directories. EMR fakes directories by creating special
``*_$folder$`` keys in S3.
For example if your job outputs ``s3://walrus/tmp/output/part-00000``,
EMR will also create these keys:
- ``s3://walrus/tmp_$folder$``
- ``s3://walrus/tmp/output_$folder$``
If you want to grant another Amazon user access to your files so they
can use them in S3, you must grant read access on the actual keys,
plus any ``*_$folder$`` keys that "contain" your keys; otherwise
EMR will error out with a permissions error.
This gets all the ``*_$folder$`` keys associated with the given URI,
as boto Key objects.
This does not support globbing.
You may optionally pass in an existing S3 connection through
``s3_conn``.
"""
if not s3_conn:
s3_conn = self.make_s3_conn()
bucket_name, key_name = parse_s3_uri(uri)
bucket = s3_conn.get_bucket(bucket_name)
dirs = key_name.split('/')
for i in range(len(dirs)):
folder_name = '/'.join(dirs[:i]) + '_$folder$'
key = bucket.get_key(folder_name)
if key:
yield key
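# Illustration (hedged sketch; the URI is made up): for
# get_s3_folder_keys('s3://walrus/tmp/output/part-00000') the candidate keys
# checked in the bucket are '_$folder$', 'tmp_$folder$' and
# 'tmp/output_$folder$'; only those that actually exist are yielded.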