This file is indexed.

/usr/share/pyshared/cogent/parse/ebi.py is in python-cogent 1.5.1-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
#!/usr/bin/env python
"""Provide a parser for SwissProt EBI format files.
"""
import sys

from string import maketrans, strip, rstrip
from pprint import pprint, pformat
from cogent.parse.record_finder import DelimitedRecordFinder,\
    LabeledRecordFinder, is_empty, TailedRecordFinder
from cogent.parse.record import RecordError, FieldError
from cogent.util.misc import identity, curry,\
        NestedSplitter, list_flatten
from cogent.core.sequence import Sequence

__author__ = "Zongzhi Liu and Sandra Smit"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Zongzhi Liu", "Sandra Smit", "Rob Knight", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Zongzhi Liu"
__email__ = "zongzhi.liu@gmail.com"
__status__ = "Development"

# Identity translation table: maketrans('', '') maps every character to
# itself.
all_chars = maketrans('','')

def rstrip_(chars=None):
    """return rstrip curried with its chars keyword fixed to ``chars``.

    chars: forwarded unchanged as the ``chars`` keyword of rstrip.  Handy
    for building an item_modifier, e.g. rstrip_(') ').
    """
    stripper = curry(rstrip, chars=chars)
    return stripper

# Record finder built with '//' as the delimiter; rstrip is the per-line
# constructor.
EbiFinder = DelimitedRecordFinder('//', constructor=rstrip)


def no_indent(s):
    """return True when s does not begin with a space"""
    return not s.startswith(' ')

# Labeled record finder using no_indent as the label test.
# constructor=None: presumably lines are passed through unmodified -- confirm
# against LabeledRecordFinder.
hanging_paragraph_finder = LabeledRecordFinder(no_indent, constructor=None)


def endswith_period(x):
    """return True when x ends with a period"""
    return x.endswith('.')

# Tailed record finder using endswith_period as the tail test.
period_tail_finder = TailedRecordFinder(endswith_period)


#################################
# pairs_to_dict 
def pairs_to_dict(key_values, dict_mode=None,
        all_keys=None, handlers=None, default_handler=None):
    """return a dict built from a sequence of (key, value) pairs.

    key_values: (key, value) pairs, from any iterable. Example:
    [('a', 1), ('b', 2), ('b', 3)]

    dict_mode: one of four modes deciding how duplicated keys are handled.
    'overwrite_value': default, same as dict(key_values);
    the key_values example gives {'a': 1, 'b': 3}
    'no_duplicated_key': raise ValueError when a key is duplicated.
    'allow_multi_value': a key seen once keeps its bare value, a duplicated
    key gets a list of its values;
    the example gives {'a': 1, 'b': [2, 3]}
    'always_multi_value': always group the value(s) of each key into a list;
    the example gives {'a': [1], 'b': [2, 3]}

    all_keys: container of the allowed keys. If given (and not empty), a key
    not found in all_keys raises ValueError; a dict or set is recommended for
    efficiency.

    handlers: mapping of key -> callable used to convert the raw value.
    None (the default) means no handlers.
    default_handler: callable used for keys missing from handlers.
    The handler is looked up as handlers.get(key, default_handler).

    When neither handlers nor default_handler is provided, the original
    values are used, e.g. pairs_to_dict(adict.items()) returns a copy of
    adict.  When they are provided but no handler is found for a key,
    ValueError is raised.

    Note: default_handler=identity is often useful if you want to keep the
    original value when no handler is found.

    Raises ValueError for an unknown dict_mode, a key outside all_keys, a
    duplicated key in 'no_duplicated_key' mode, or a key without a handler.
    """
    if not dict_mode:
        dict_mode = 'overwrite_value'
    # None stands in for "no handlers": avoids a shared mutable default and
    # lets callers pass handlers=None together with a default_handler.
    if handlers is None:
        handlers = {}

    # Pick the strategy used to merge one (key, value) pair into the result.
    if dict_mode == 'always_multi_value':
        def add_item(dictionary, key, value):
            """append value to the list stored under key"""
            dictionary.setdefault(key, []).append(value)

    elif dict_mode == 'allow_multi_value':
        # Keys whose value has already been promoted to a list.  Tracked
        # separately because a first value that is itself a list must not be
        # mistaken for an accumulated list.
        multiples = set()
        def add_item(dictionary, key, value):
            """store value bare, or collect duplicates into a list"""
            if key in dictionary:
                if key not in multiples:
                    multiples.add(key)
                    dictionary[key] = [dictionary[key]]
                dictionary[key].append(value)
            else:
                dictionary[key] = value

    elif dict_mode == 'no_duplicated_key':
        def add_item(dictionary, key, value):
            """store value, refusing to replace an existing key"""
            if key in dictionary:
                raise ValueError('Duplicated Key')
            dictionary[key] = value

    elif dict_mode == 'overwrite_value':
        def add_item(dictionary, key, value):
            """store value, replacing any previous one"""
            dictionary[key] = value

    else: # unknown dict_mode
        raise ValueError('Unknown dict_mode:%s. \ndict_mode must be one of '
                'overwrite_value, no_duplicated_key, allow_multi_value and '
                'always_multi_value.' % (dict_mode,))

    # Values are converted only when some handler was supplied.
    use_handlers = bool(handlers or default_handler)

    result = {}
    for key, raw_value in key_values:
        if all_keys and key not in all_keys:
            raise ValueError('key: %s not in all_keys: %s'
                    % (repr(key), all_keys))
        if use_handlers:
            handler = handlers.get(key, default_handler)
            if not handler:
                # (key,) keeps the formatting safe when key is a tuple
                raise ValueError('No handler found for %s' % (key,))
            value = handler(raw_value)
        else:
            value = raw_value
        add_item(result, key, value)
    return result

#################################
# generic parsers

def linecode_maker(line):
    """return (linecode, line) for a single line.

    The two-character line code that begins each line is always followed
    by three blanks, so the code is whatever precedes the first run of three
    blanks; a line without such a run is its own linecode.  The line itself
    is returned unchanged."""
    code, _blanks, _rest = line.partition('   ')
    return code, line

def labeloff(lines, splice_from=5):
    """return a new list with the first splice_from characters removed from
    each line (by default the line code plus its three blanks).

    Warning: no check is made that the removed part really is a label."""
    unlabeled = []
    for text in lines:
        unlabeled.append(text[splice_from:])
    return unlabeled

def join_parser(lines, join_str=' ', chars_to_strip=' ;.'):
    """return a single str from a list of lines, with the requested chars
    stripped off both ends of the joined str.

    lines: a list of str, joined with join_str; a plain str is used as is
    (it is not joined character by character).
    join_str: separator placed between the lines.
    chars_to_strip: characters removed from both ends of the result.
    """
    # basestring only exists on Python 2 (where it covers str and unicode);
    # fall back to str so the check also works where it is missing.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(lines, string_types):
        joined = lines
    else:
        joined = join_str.join(lines)

    return joined.strip(chars_to_strip)
    
def join_split_parser(lines, delimiters=';', item_modifier=strip, 
        same_level=False, **kwargs):
    """return the result of NestedSplitter applied to the joined lines.

    The lines are first merged with join_parser, then split.

    delimiters: delimiters used by NestedSplitter
    item_modifier: passed to NestedSplitter as its constructor, to modify
    each split item.
    same_level: passed through to NestedSplitter.
    kwargs: passed to join_parser
    
    Examples:
    join_split_parser(['aa; bb;', 'cc.']) -> ['aa', 'bb', 'cc']
    join_split_parser(['aa; bb, bbb;', 'cc.'], delimiters=';,') 
    -> ['aa', ['bb','bbb'], 'cc']
    join_split_parser('aa (bb) (cc).', delimiters='(', 
    item_modifier=rstrip_(') ')) -> ['aa','bb','cc']
    """
    joined = join_parser(lines, **kwargs)
    splitter = NestedSplitter(delimiters,
        constructor=item_modifier, same_level=same_level)
    return splitter(joined)

def join_split_dict_parser(lines, delimiters=[';', ('=',1), ','], 
        dict_mode=None, strict=True, **kwargs):
    """return a dict from lines, using the split pairs from
    join_split_parser and pairs_to_dict.

    delimiters: the first two are used to split the joined lines into
    (key, value) pairs; any remaining ones are used to split each value.
    kwargs: passed to join_split_parser
    strict: what to do when dict() fails on the pairs (an item was not split
    in two by the second delimiter): raise ValueError when True (default),
    return the split list unconverted when False.

    dict_mode: passed to pairs_to_dict.  If left as None, it will be
    'overwrite_value', which is the same as dict(pairs).

    Examples:
    join_split_dict_parser(['aa=1; bb=2,3; cc=4 (if aa=1);'])
    -> {'aa':'1', 'bb': ['2','3'], 'cc': '4 (if aa=1)'}
    """
    primary_delimiters, value_delimiters = delimiters[:2], delimiters[2:]
    pairs = join_split_parser(lines, delimiters=primary_delimiters, 
            same_level=True, **kwargs)

    # Probe with dict() to catch any item that was not split into a pair,
    # e.g. "dictionary update sequence element #1 has length 1".
    try:
        dict(pairs)
    except ValueError:
        if not strict:
            #return the split list without constructing a dict
            return pairs
        # sys.exc_info() fetches the active exception without relying on a
        # version-specific 'except ... as/,' binding syntax.
        error = sys.exc_info()[1]
        raise ValueError('%s\nFailed to get a dict from pairs: %s'
                % (error, pairs))

    if value_delimiters:
        split_value = NestedSplitter(value_delimiters, same_level=False) 
        for i, (k, v) in enumerate(pairs):
            v = split_value(v)
            #replace the value only if it was actually split into several
            #parts; pairs[i] must be a mutable pair for this to work
            if len(v) > 1:
                pairs[i][1] = v

    result = pairs_to_dict(pairs, dict_mode)
    return result


def mapping_parser(line, fields, delimiters=[';', None],
        flatten=list_flatten):
    """return a dict pairing each field with the matching piece of the split
    line; an entry stored under the None key is removed from the result.

    line: should be a str, to be split.
    fields: field names, each optionally paired with a converter applied to
        the value.  example:
        ['EntryName', ('Length', int), 'MolType'] 
        Use None as a name for pieces that should be dropped.
    delimiters: separators handed to NestedSplitter to split the line.
    flatten: a function used to flatten the list from nested splitting.

    Fields and values are paired with zip, so surplus items on either side
    are ignored.
    """
    splitter = NestedSplitter(delimiters=delimiters)
    values = flatten(splitter(line))

    result = {}
    for field, value in zip(fields, values):
        if isinstance(field, (tuple, list)):
            name, converter = field
            value = converter(value)
        else:
            name = field
        result[name] = value

    # placeholder fields were collected under None; discard them
    result.pop(None, None)
    return result


#################################
# individual parsers
#################################

#################################
# mapping parsers: id, sq
#
def id_parser(lines):
    """return a dict with EntryName, DataClass, MolType and Length (int)
    from the ID lines; only the first line is used.

    The ID (IDentification) line is always the first line of an entry. The
    general form of the ID line is:

    ID   ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; SEQUENCE_LENGTH.
    Example:
    ID   CYC_BOVIN      STANDARD;      PRT;   104 AA.
    """
    id_fields = ('EntryName', 'DataClass', 'MolType', ('Length', int))
    first_line = labeloff(lines)[0]
    return mapping_parser(first_line, fields=id_fields,
        delimiters=[';', None])

def sq_parser(lines):
    """return a dict with Length (int), MolWeight (int) and Crc64 from the
    SQ lines; only the first line is used.

    The SQ (SeQuence header) line marks the beginning of the sequence data and 
    gives a quick summary of its content.  The format of the SQ line is:

    SQ   SEQUENCE XXXX AA; XXXXX MW; XXXXXXXXXXXXXXXX CRC64;

    The line contains the length of the sequence in amino acids ('AA')
    followed by the molecular weight ('MW') rounded to the nearest mass unit
    (Dalton) and the sequence 64-bit CRC (Cyclic Redundancy Check) value 
    ('CRC64').
    """
    # None marks the pieces to drop: 'SEQUENCE', 'AA' and 'MW'
    sq_fields = (None, ('Length', int), None, ('MolWeight', int), None,
        'Crc64')
    first_line = labeloff(lines)[0]
    return mapping_parser(first_line, fields=sq_fields,
        delimiters=[';', None])

def kw_parser(lines):
    """return the keywords found on the KW lines, as produced by
    join_split_parser with its default settings.

    The format of the KW line is: 
    KW   Keyword[; Keyword...].
    """
    return join_split_parser(labeloff(lines))

def ac_parser(lines):
    """return the accession numbers found on the AC lines, as produced by
    join_split_parser with its default settings.

    The AC (ACcession number) line lists the accession number(s) associated 
    with an entry. The format of the AC line is:
    AC   AC_number_1;[ AC_number_2;]...[ AC_number_N;]

    The first accession number is commonly referred to as the 'primary
    accession number'. 'Secondary accession numbers' are sorted
    alphanumerically.
    """
    return join_split_parser(labeloff(lines))

def dt_parser(lines):
    """return the DT lines with only their line label removed.
    
    Note: the dates and comments are not parsed any further.

    The DT (DaTe) lines show the date of creation and last modification of the
    database entry.
    The format of the DT line in Swiss-Prot is:
        DT   DD-MMM-YYYY (Rel. XX, Comment)
    The format of the DT line in TrEMBL is:
        DT   DD-MMM-YYYY (TrEMBLrel. XX, Comment)

    There are always three DT lines in each entry, each of them is associated
    with a specific comment:
    * The first DT line indicates when the entry first appeared in the
      database. The comment is 'Created';
    * The second DT line indicates when the sequence data was last modified.
      The comment is 'Last sequence update';
    * The third DT line indicates when data other than the sequence was last
      modified. The comment is 'Last annotation update'.

    Example of a block of Swiss-Prot DT lines:
        DT   01-AUG-1988 (Rel. 08, Created)
        DT   30-MAY-2000 (Rel. 39, Last sequence update)
        DT   10-MAY-2005 (Rel. 47, Last annotation update)
    """
    return labeloff(lines)

#################################
# gn_parser
def gn_parser(lines):
    """return a list with one gn_itemparser result per gene block of the GN
    lines.
    
    The GN (Gene Name) line indicates the name(s) of the gene(s) that code for
    the stored protein sequence. The GN line contains three types of
    information: Gene names, Ordered locus names, ORF names. format:

    GN   Name=<name>; Synonyms=<name1>[, <name2>...];
    OrderedLocusNames=<name1>[, <name2>...];
    GN   ORFNames=<name1>[, <name2>...];

    None of the above four tokens are mandatory. But a "Synonyms" token can
    only be present if there is a "Name" token.

    If there is more than one gene, GN line blocks for the different genes are 
    separated by the following line:
    GN   and
    Example:
    GN   Name=Jon99Cii; Synonyms=SER1, SER5, Ser99Da; ORFNames=CG7877;
    GN   and
    GN   Name=Jon99Ciii; Synonyms=SER2, SER5, Ser99Db; ORFNames=CG15519;"""
    gene_blocks = gn_itemfinder(labeloff(lines))
    return [gn_itemparser(block) for block in gene_blocks]

# Each gene block is parsed by join_split_dict_parser with its default
# settings.
gn_itemparser = join_split_dict_parser 

# Record finder built with 'and' as the delimiter, matching the 'GN   and'
# separator line described in gn_parser; the delimiter is not kept
# (keep_delimiter=False) and strict checking is off (strict=False).
gn_itemfinder = DelimitedRecordFinder('and', constructor=None, strict=False,
        keep_delimiter=False)

def oc_parser(lines):
    """return the taxonomic nodes found on the OC lines, as produced by
    join_split_parser with its default settings.

    The OC (Organism Classification) lines contain the taxonomic classification
    of the source organism.  The classification is listed top-down as nodes in
    a taxonomic tree in which the most general grouping is given first. format:

    OC   Node[; Node...].
    """
    return join_split_parser(labeloff(lines))

def os_parser(lines):
    """return the names found on the OS lines: the joined text is split at
    each '(' and every piece has trailing ')' and blanks removed.
    
    OS (Organism Species) line specifies the organism which was the source of
    the stored sequence.  The last OS line is terminated by a period.

    The species designation consists, in most cases, of the Latin genus and
    species designation followed by the English name (in parentheses). For
    viruses, only the common English name is given.

    Examples of OS lines are shown here:
        OS   Escherichia coli.
        OS   Solanum melongena (Eggplant) (Aubergine).
        OS   Rous sarcoma virus (strain SchRuppin A) (RSV-SRA) (Avian leukosis
        OS   virus-RSA).
    """
    unlabeled = labeloff(lines)
    drop_closing_paren = rstrip_(') ')
    return join_split_parser(unlabeled,
        delimiters='(', item_modifier=drop_closing_paren)
        
def ox_parser(lines):
    """return a dict from the OX lines, as produced by
    join_split_dict_parser with its default settings.

    The OX (Organism taxonomy cross-reference) line is used to indicate the 
    identifier of a specific organism in a taxonomic database. The format:
    OX   Taxonomy_database_Qualifier=Taxonomic code;

    Currently the cross-references are made to the taxonomy database of NCBI, 
    which is associated with the qualifier 'TaxID' and a one- to six-digit 
    taxonomic code."""
    return join_split_dict_parser(labeloff(lines))

def og_parser(lines):
    """return a list with one entry per period-terminated OG item: a str, or
    a list of plasmid names when the item starts with 'Plasmid'.

    The OG (OrGanelle) line indicates if the gene coding for a protein
    originates from the mitochondria, the chloroplast, the cyanelle, the
    nucleomorph or a plasmid.  The format of the OG line is:

    OG   Hydrogenosome.
    OG   Mitochondrion.
    OG   Nucleomorph.
    OG   Plasmid name.
    OG   Plastid.
    OG   Plastid; Apicoplast.
    OG   Plastid; Chloroplast.
    OG   Plastid; Cyanelle.
    OG   Plastid; Non-photosynthetic plastid.
    Where 'name' is the name of the plasmid. example:

    OG   Mitochondrion.
    OG   Plasmid R6-5, Plasmid IncFII R100 (NR1), and
    OG   Plasmid IncFII R1-19 (R1 drd-19)."""
    organelles = []
    for record in period_tail_finder(labeloff(lines)):
        text = ' '.join(record).rstrip('. ')
        if not text.startswith('Plasmid'):
            organelles.append(text)
            continue
        #plasmid items: drop the ' and' joiner, then split the names at ','
        text = text.replace(' and', '')
        organelles.append([name.strip() for name in text.split(',')])
    return organelles


#################################
# dr_parser
def dr_parser(lines):
    """return a dict mapping each database identifier to a list holding the
    remaining fields of every DR item for that database.

    The DR (Database cross-Reference) lines are used as pointers to information
    related to entries and found in data collections other than Swiss-Prot. 
    The format of one of many DR line is:

    DR   DATABASE_IDENTIFIER; PRIMARY_IDENTIFIER; SECONDARY_IDENTIFIER[;
    TERTIARY_IDENTIFIER][; QUATERNARY_IDENTIFIER].
    """
    records = period_tail_finder(labeloff(lines))
    keyvalues = [dr_itemparser(record) for record in records]
    #a database may be referenced several times, so always collect lists
    return pairs_to_dict(keyvalues, 'always_multi_value')

def dr_itemparser(lines):
    """return a (key, value) pair from the lines of one DR item: the first
    field from join_split_parser is the key, the remaining fields the value.
    """
    fields = join_split_parser(lines)
    database, identifiers = fields[0], fields[1:]
    return database, identifiers

#################################
# de_parser
def de_parser(lines):
    """return a dict of {OfficalName: str, Synonyms: [str,], Fragment: bool,
    Contains: [itemdict,],  Includes: [itemdict,]} from DE lines

    Each itemdict, like the OfficalName/Synonyms part of the result, comes
    from de_itemparser ('OfficalName' is the key spelling actually used).
    
    The DE (DEscription) lines contain general descriptive information about
    the sequence stored. This information is generally sufficient to identify
    the protein precisely.

    The description always starts with the proposed official name of the
    protein. Synonyms are indicated between brackets. Examples below

    If a protein is known to be cleaved into multiple functional components,
    the description starts with the name of the precursor protein, followed by
    a section delimited by '[Contains: ...]'. All the individual components are
    listed in that section and are separated by semi-colons (';'). Synonyms are
    allowed at the level of the precursor and for each individual component.
    
    If a protein is known to include multiple functional domains each of which
    is described by a different name, the description starts with the name of
    the overall protein, followed by a section delimited by '[Includes: ]'. All
    the domains are listed in that section and are separated by semi-colons
    (';'). Synonyms are allowed at the level of the protein and for each
    individual domain. 

    In rare cases, the functional domains of an enzyme are cleaved, but the
    catalytic activity can only be observed, when the individual chains
    reorganize in a complex. Such proteins are described in the DE line by a
    combination of both '[Includes:...]' and '[Contains:...]', in the order
    given in the argJ example below.

    If the complete sequence is not determined, the last information given on
    the DE lines is '(Fragment)' or '(Fragments)'; both set Fragment to True
    and are removed from the description. Example:

    DE   Dihydrodipicolinate reductase (EC 1.3.1.26) (DHPR) (Fragment).

    DE   Arginine biosynthesis bifunctional protein argJ [Includes: Glutamate
    DE   N-acetyltransferase (EC 2.3.1.35) (Ornithine acetyltransferase)
    DE   (Ornithine transacetylase) (OATase); Amino-acid acetyltransferase
    DE   (EC 2.3.1.1) (N-acetylglutamate synthase) (AGS)] [Contains: Arginine
    DE   biosynthesis bifunctional protein argJ alpha chain; Arginine
    DE   biosynthesis bifunctional protein argJ beta chain] (Fragment).

    Trouble maker:
    DE Amiloride-sensitive amine oxidase [copper-containing] precursor(EC
    DE 1.4.3.6) (Diamine oxidase) (DAO).
    """
    labeloff_lines = labeloff(lines)
    joined = join_parser(labeloff_lines, chars_to_strip='). ')

    #the closing ')' has already been stripped from the joined str, so the
    #markers are matched without it; both singular and plural forms count
    fragment_labels = ('(Fragment', '(Fragments')
    contains_label = '[Contains:'
    includes_label = '[Includes:'

    #Process Fragment: cut the trailing marker off the description
    fragment = joined.endswith(fragment_labels)
    if fragment:
        joined = joined.rsplit('(', 1)[0]
        
    #Process Contains first: it follows Includes in the text, so removing it
    #leaves the Includes section at the end of the remaining str
    contains = []
    if contains_label in joined:
        joined, contains_str = joined.split(contains_label)
        contains_str = contains_str.strip(' ]')
        contains = [de_itemparser(item)
                for item in contains_str.split('; ')]

    #Process Includes
    includes = []
    if includes_label in joined:
        joined, includes_str = joined.split(includes_label)
        includes_str = includes_str.strip(' ]')
        includes = [de_itemparser(item)
                for item in includes_str.split('; ')]

    #Process Primary: whatever is left is the name plus its synonyms
    result = {'Includes': includes, 'Contains': contains,
            'Fragment': fragment}
    result.update(de_itemparser(joined))
    return result

def de_itemparser(line):
    """return a dict of {OfficalName: str, Synonyms: [str,]} from a de_item
    
    The description item is a str that starts with the proposed official
    name of the protein, followed by synonyms between brackets, e.g.
    
    'Annexin A5 (Annexin V) (Lipocortin V) (Endonexin II)'

    The str is cut at every '(' and each piece has ')' and blanks removed
    from its ends.  Synonyms is an empty list when there is no '('.
    """
    pieces = line.split('(')
    official_name = pieces[0].strip(') ')
    synonyms = [piece.strip(') ') for piece in pieces[1:]]
    return {'OfficalName': official_name, 'Synonyms': synonyms}
    

#################################
# ft_parser
def ft_parser(lines):
    """return a dict of feature items from FT lines.

    Result layout: HELIX, STRAND and TURN items are collected as
    (keyname, start, end) tuples under result['SecondaryStructure']; every
    other item is stored under result[keyname] as a dict with the keys
    'Start', 'End' and 'Description'.  The description is parsed further
    when ft_description_parsers has an entry for the keyname.

    The FT (Feature Table) lines lists posttranslational modifications, binding
    sites, enzyme active sites, local secondary structure or other
    characteristics reported in the cited references. Sequence conflicts
    between references are also included in the feature table.
    
    The FT lines have a fixed format. The column numbers allocated to each of
    the data items within each FT line are shown in the following table (column
    numbers not referred to in the table are always occupied by blanks).

        Columns     Data item
        1-2     FT
        6-13    Key name
        15-20   'From' endpoint
        22-27   'To' endpoint
        35-75   Description

    The key name and the endpoints are always on a single line, but the
    description may require one or more additional lines. The following 
    description lines continues from column 35 onwards.  For more information
    about individual ft keys, see 
    http://us.expasy.org/sprot/userman.html#FT_keys

    'FROM' and 'TO' endpoints: Numbering start from 1; When a feature is known
    to extend beyond the position that is given in the feature table, the
    endpoint specification will be preceded by '<' for features which continue
    to the left end (N-terminal direction) or by '>' for features which
    continue to the right end (C- terminal direction); Unknown endpoints are
    denoted by '?'. Uncertain endpoints are denoted by a '?' before the
    position, e.g. '?42'.

    Some features (CARBOHYD, CHAIN, PEPTIDE, PROPEP, VARIANT and VARSPLIC) are
    associated with a unique and stable feature identifier (FTId), which allows
    to construct links directly from position-specific annotation in the
    feature table to specialized protein-related databases.  The FTId is always
    the last component of a feature in the description field. 

    Examples:
        FT   SIGNAL       <1     10       By similarity.
        FT   MOD_RES      41     41       Arginine amide (G-42 provides amide
        FT                                group) (By similarity).
        FT   CONFLICT    327    327       E -> R (in Ref. 2).
        FT   CONFLICT     77     77       Missing (in Ref. 1).
        FT   CARBOHYD    251    251       N-linked (GlcNAc...).
        FT                                /FTId=CAR_000070.
        FT   PROPEP       25     48
        FT                                /FTId=PRO_0000021449.
        FT   VARIANT     214    214       V -> I.
        FT                                /FTId=VAR_009122.
        FT   VARSPLIC     33     83       TVGRFRRRATP -> PLTSFHPFTSQMPP (in
        FT                                isoform 2).
        FT                                /FTId=VSP_004370.

    Secondary structure (HELIX, STRAND, TURN) - The feature table of sequence
    entries of proteins whose tertiary structure is known experimentally
    contains the secondary structure information extracted from the coordinate
    data sets of the Protein Data Bank (PDB).  Residues not specified in one of
    these classes are in a 'loop' or 'random-coil' structure. 
    """
    secondary_structure_keynames = ('HELIX', 'STRAND', 'TURN')
    result = {}
    for item in hanging_paragraph_finder(labeloff(lines)):
        keyname, start, end, description = ft_basic_itemparser(item)

        if keyname in secondary_structure_keynames:
            #secondary structures share one list and carry no description
            structures = result.setdefault('SecondaryStructure', [])
            structures.append((keyname, start, end))
            continue

        #some keynames have a dedicated parser for their description
        if keyname in ft_description_parsers:
            parse_description = ft_description_parsers[keyname]
            description = parse_description(description)

        feature = {'Start': start, 'End': end, 'Description': description}
        result.setdefault(keyname, []).append(feature)
    return result

def ft_basic_itemparser(item_lines):
    """Split the lines of one feature item into its four basic fields.

    item_lines: the lines of a single FT item.  The first line carries the
    key name, the two location columns and the start of the description;
    any further lines only continue the description column.

    Returns (keyname, start, end, description).  start and end are passed
    through try_int, so a column that is not a plain integer (e.g. '>102')
    comes back as its stripped text.  Leading and trailing spaces and
    periods are removed from the description.

    WARNING: not complete, location fields need further work?
    """
    #column boundaries counted on the labelled line, see doc of ft_parser
    labelled_boundaries = (15, 22, 35)
    #the key name starts at 0 (instead of 6) after labeloff
    label_width = 6
    boundaries = [pos - label_width for pos in labelled_boundaries]

    #slice the first line at the boundaries to get the four raw fields
    head = item_lines[0]
    raw_fields = []
    for begin, stop in zip([0] + boundaries, boundaries + [None]):
        raw_fields.append(head[begin:stop].strip())
    keyname, from_point, to_point, description = raw_fields

    #continuation lines only add to the description column
    continuation = item_lines[1:]
    if continuation:
        desc_column = boundaries[-1]
        extra = ' '.join([line[desc_column:].strip() for line in continuation])
        description = ' '.join((description, extra))

    #locations become ints where possible
    from_point = try_int(from_point)
    to_point = try_int(to_point)
    return keyname, from_point, to_point, description.strip(' .')

def try_int(obj):
    """Return int(obj), or obj itself when the conversion fails.

    obj: any object; in this module it is the stripped text of a location
    column of a feature line.

    Conversion failures are a ValueError (text that is not an integer
    literal, e.g. '>102' or ''), a TypeError (objects int() does not
    accept, e.g. None) and an OverflowError (e.g. an infinite float).  In
    all of these cases the original object is returned unchanged.
    """
    try:
        return int(obj)
    except (ValueError, TypeError, OverflowError):
        return obj


### ft description_parsers below

def ft_id_parser(description):
    """return a dict of {'Description':,'Id':} from raw description str

    description: the description text of a feature item, which may end with
    a feature identifier introduced by '/FTId='.

    When the text contains exactly one '/FTId=' marker, the part before it
    becomes 'Description' and the part after it becomes 'Id', both with
    surrounding spaces and periods removed.  In every other case (no marker,
    or more than one) the whole text is returned as 'Description' and 'Id'
    is ''.

    Examples.
    FT   PROPEP       25     48
    FT                                /FTId=PRO_0000021449.
    FT   VARIANT     214    214       V -> I.
    FT                                /FTId=VAR_009122.
    FT   VARSPLIC     33     83       TVGRFRRRATP -> PLTSFHPFTSQMPP (in
    FT                                isoform 2).
    FT                                /FTId=VSP_004370.
    """
    fieldnames = ['Description', 'Id']
    id_sep = '/FTId='

    #an explicit length check replaces the former bare except, which also
    #hid unrelated errors
    parts = description.split(id_sep)
    if len(parts) == 2:
        desc, ft_id = [part.strip(' .') for part in parts]
    else:
        desc, ft_id = description, ''

    return dict(zip(fieldnames, [desc, ft_id]))

def ft_mutation_parser(description, mutation_comment_delimiter='('):
    """Split a mutation description into MutateFrom, MutateTo and Comment.

    description: description text of a feature item (without the FTId).
    mutation_comment_delimiter: the text separating the mutation from its
    comment; only the first occurrence is used.

    Returns a dict with the keys 'MutateFrom', 'MutateTo' and 'Comment'.
    Trailing spaces and closing parentheses are removed from the description
    first.  When the delimiter is absent the comment is ''.  When the
    mutation has no '->', the whole mutation text is stored as it is in
    'MutateFrom' and 'MutateTo' is ''.

    Warning: if both id and mutation should be parsed, always parse id first.

    Note: will add exceptions later

    Examples.
    FT   VARIANT     214    214       V -> I (in a lung cancer).
    FT                                /FTId=VAR_009122.
    FT   CONFLICT    484    484       Missing (in Ref. 2).
    FT   CONFLICT    802    802       K -> Q (in Ref. 4, 5 and 10).
    """
    fieldnames = 'MutateFrom MutateTo Comment'.split()

    #separate the mutation from the comment at the first delimiter
    trimmed = description.rstrip(' )')
    try:
        mutation, comment = trimmed.split(mutation_comment_delimiter, 1)
    except ValueError:  #could not unpack into two parts
        mutation, comment = trimmed, ''

    #separate both sides of the mutation at the first '->'
    sides = mutation.split('->', 1)
    if len(sides) == 2:
        mut_from, mut_to = [strip(side) for side in sides]
    else:
        #unknown from/to: the mutation message goes to mut_from
        mut_from, mut_to = mutation, ''

    return dict(zip(fieldnames, [mut_from, mut_to, comment]))

def ft_mutagen_parser(description):
    """Parse a MUTAGEN description into a mutation dict.

    MUTAGEN - Site which has been experimentally altered.  The work is done
    by ft_mutation_parser, called with ':' as the delimiter between the
    mutation and its comment.  Examples

    FT   MUTAGEN     119    119       C->R,E,A: Loss of cADPr hydrolase and
    FT                                ADP-ribosyl cyclase activity.
    FT   MUTAGEN     169    177       Missing: Abolishes ATP-binding.
    """
    comment_delimiter = ':'
    return ft_mutation_parser(description, comment_delimiter)

def ft_id_mutation_parser(description):
    """Parse a description holding both a mutation and a feature id.

    The id is split off first with ft_id_parser; the remaining description
    is then parsed with ft_mutation_parser.  The result is one dict with the
    'Id' key plus the keys produced by ft_mutation_parser.

    Examples.
    FT   VARIANT     214    214       V -> I.
    FT                                /FTId=VAR_009122.
    FT   VARSPLIC     33     83       TVGRFRRRATP -> PLTSFHPFTSQMPP (in
    FT                                isoform 2).
    FT                                /FTId=VSP_004370.
    """
    parsed = ft_id_parser(description)
    #swap the raw 'Description' entry for its parsed mutation fields
    mutation_fields = ft_mutation_parser(parsed.pop('Description'))
    parsed.update(mutation_fields)
    return parsed
    
#maps a feature key name to the function that further parses its
#description; ft_parser looks the key name up here and leaves the
#description untouched for key names that are not listed
ft_description_parsers = {
    'VARIANT': ft_id_mutation_parser,
    'VARSPLIC': ft_id_mutation_parser,
    'CARBOHYD':ft_id_parser,
    'CHAIN': ft_id_parser,
    'PEPTIDE':ft_id_parser,
    'PROPEP': ft_id_parser,
    'CONFLICT': ft_mutation_parser,
    'MUTAGEN': ft_mutagen_parser,
    #'NON_TER': ft_choplast_parser, 
    #'NON_CONS': ft_choplast_parser,
    
}


#################################
# cc_parser
#topic names accepted by cc_parser when strict; only the keys matter, every
#value is None
all_cc_topics = dict.fromkeys((
    'ALLERGEN',
    'ALTERNATIVE PRODUCTS',
    'BIOPHYSICOCHEMICAL PROPERTIES',
    'BIOTECHNOLOGY',
    'CATALYTIC ACTIVITY',
    'CAUTION',
    'COFACTOR',
    'DATABASE',
    'DEVELOPMENTAL STAGE',
    'DISEASE',
    'DOMAIN',
    'ENZYME REGULATION',
    'FUNCTION',
    'INDUCTION',
    'INTERACTION',
    'MASS SPECTROMETRY',
    'MISCELLANEOUS',
    'PATHWAY',
    'PHARMACEUTICAL',
    'POLYMORPHISM',
    'PTM',
    'RNA EDITING',
    'SIMILARITY',
    'SUBCELLULAR LOCATION',
    'SUBUNIT',
    'TISSUE SPECIFICITY',
    'TOXIC DOSE',
))
    
def cc_parser(lines, strict=False):
    """return a dict of {topic: a list of values} from CC lines.

    lines: the CC lines of one entry, still carrying their line label.
    strict: if True (default False), raise FieldError for any topic name
    that is not a key of all_cc_topics.

    some topics have special format and will use specific parsers defined in
    cc_content_parsers; every other topic is handled by join_parser.

    If building the result fails, the label-stripped lines are pretty
    printed and the exception is re-raised with its original traceback.

    The CC lines are free text comments on the entry, and are used to convey
    any useful information. The comments always appear below the last reference
    line and are grouped together in comment blocks; a block is made up of 1 or
    more comment lines. The first line of a block starts with the characters
    '-!-'.  Format:

    CC   -!- TOPIC: First line of a comment block;
    CC       second and subsequent lines of a comment block.

    Examples:
    CC   -!- DISEASE: Defects in PHKA1 are linked to X-linked muscle
    CC       glycogenosis [MIM:311870]. It is a disease characterized by slowly
    CC       progressive, predominantly distal muscle weakness and atrophy.
    CC   -!- DATABASE: NAME=Alzheimer Research Forum; NOTE=APP mutations;
    CC       WWW="http://www.alzforum.org/res/com/mut/app/default.asp".
    CC   -!- INTERACTION:
    CC       Self; NbExp=1; IntAct=EBI-123485, EBI-123485;
    CC       Q9W158:CG4612; NbExp=1; IntAct=EBI-123485, EBI-89895;
    CC       Q9VYI0:fne; NbExp=1; IntAct=EBI-123485, EBI-126770;
    CC   -!- BIOPHYSICOCHEMICAL PROPERTIES:
    CC       Kinetic parameters:
    CC         KM=98 uM for ATP;
    CC         KM=688 uM for pyridoxal;
    CC         Vmax=1.604 mmol/min/mg enzyme;
    CC       pH dependence:
    CC         Optimum pH is 6.0. Active from pH 4.5 to 10.5;
    CC   -!- ALTERNATIVE PRODUCTS:
    CC       Event=Alternative splicing; Named isoforms=3;
    CC         Comment=Additional isoforms seem to exist. Experimental
    CC         confirmation may be lacking for some isoforms;
    CC       Name=1; Synonyms=AIRE-1;
    CC         IsoId=O43918-1; Sequence=Displayed;
    CC       Name=2; Synonyms=AIRE-2;
    CC         IsoId=O43918-2; Sequence=VSP_004089;
    CC       Name=3; Synonyms=AIRE-3;
    CC         IsoId=O43918-3; Sequence=VSP_004089, VSP_004090;
    CC   --------------------------------------------------------------------------
    CC   This SWISS-PROT entry is copyright. It is produced through a collaboration
    CC   removed.
    CC   --------------------------------------------------------------------------
    """
    lines = labeloff(lines)
    #cc_itemfinder yield each topic block
    #cc_basic_itemparser split a topic block into (topic_name, content_as_list)
    topic_contents = map(cc_basic_itemparser, cc_itemfinder(lines))
    #content of a topic further parsed using a content_parser decided by the
    #topic name.  result is grouped into a dict.
    try:
        result = pairs_to_dict(topic_contents, 'always_multi_value',
            handlers=cc_content_parsers, default_handler=join_parser)
    except Exception:
        #show the offending lines, then let the same exception continue; a
        #bare raise keeps the traceback of the real failure point
        pprint(lines)
        raise

    if strict:
        for topic in result:
            if topic not in all_cc_topics:
                raise FieldError('Invalid topic: %s' % topic)

    return result

def cc_basic_itemparser(topic):
    """return (topic_name, topic_content as a list) from a cc topic block.

    topic: the lines of one topic block, without the line label.

    The topic name is the text of the first line before its first ':' (after
    the leading ' ', '-' and '!' characters are removed).  The content list
    starts with the rest of that first line, if there is any, followed by
    the remaining lines with their 4 formatting characters taken off by
    labeloff.

    Raises FieldError if the first line has no ':'.

    Format of a topic as input of this function: [
    '-!- TOPIC: First line of a comment block;',
    '    second and subsequent lines of a comment block.']
    """
    num_format_leading_spaces = 4  #for topic lines except the first

    #get the keyname and content_head from the first line
    topic_head = topic[0].lstrip(' -!')
    try:
        keyname, content_head = map(strip, topic_head.split(':', 1))
    except ValueError: # need more than 1 value to unpack
        #the line is interpolated with %, so the message names the bad line
        raise FieldError('Not a valid topic line: %s' % topic[0])

    if content_head:
        content = [content_head]
    else:
        content = []

    #the following lines be stripped off the format leading spaces
    if len(topic) > 1:
        content += labeloff(topic[1:], num_format_leading_spaces)

    return keyname, content

def cc_itemfinder(lines):
    """yield each topic/license as a list from CC lines without label and
    leading spaces.

    lines: the CC lines of one entry with the line label removed.  The
    sequence passed in is not modified; all rewriting happens on a copy.

    If the lines hold a license block (delimited by two border lines of 74
    '-' characters), the block is rewritten to look like a normal topic
    named LICENSE before the lines are handed to hanging_paragraph_finder.

    Raises FieldError if a license border is present but the last line is
    not a border.

    Warning: hardcoded LICENSE handling"""

    ## all the codes except the return line  tries to preprocess the
    #license block

    #two clusters of '-' are used as borders for license, as observed
    license_border = '-' * 74
    license_headstr = '-!- LICENSE:'
    content_start = 4  #the idx where topic content starts

    #copy first, so the pop and the assignments below never alter the
    #caller's list
    lines = list(lines)

    if license_border in lines:
        #discard the bottom license border
        if lines[-1] != license_border:
            raise FieldError('No bottom line for license: %s' % lines)
        lines.pop()

        #normalize license lines to the format of topic lines
        license_idx = lines.index(license_border)
        lines[license_idx] = license_headstr
        indent = ' ' * content_start
        for i in range(license_idx + 1, len(lines)):
            lines[i] = indent + lines[i]

    #the return line is all we need, if no license block
    return hanging_paragraph_finder(lines)


## cc_content_parsers here below

def cc_interaction_parser(content_list):
    """return a list of (interactor, {params}) from interaction content.

    content_list: the content lines of an INTERACTION topic.  Each line is
    cut at its first ';': the stripped text before it is the interactor and
    the text after it is parsed by join_split_dict_parser.  A line without
    ';' raises ValueError.

    Format:
    -!- INTERACTION:
        {{SP_Ac:identifier[ (xeno)]}|Self}; NbExp=n; IntAct=IntAct_Protein_Ac,
        IntAct_Protein_Ac;
    """
    interactions = []
    for entry in content_list:
        interactor, raw_params = entry.split(';', 1)
        parsed_params = join_split_dict_parser([raw_params])
        interactions.append((interactor.strip(), parsed_params))
    return interactions
        
#record finders used by cc_alternative_products_parser; the label
#predicates match lines starting with 'Event=' and 'Name=' respectively
cc_alternative_products_event_finder = LabeledRecordFinder(
        lambda line: line.startswith('Event='))
cc_alternative_products_name_finder = LabeledRecordFinder(
        lambda line: line.startswith('Name='))
def cc_alternative_products_parser(content_list):
    """return a list of event dicts from ALTERNATIVE PRODUCTS lines.

    content_list: the content lines of an ALTERNATIVE PRODUCTS topic.

    The lines are grouped into events by the event finder, and each event is
    grouped again by the name finder.  The first group of an event is parsed
    with join_split_dict_parser into the event dict; any further groups are
    parsed the same way and stored as a list under the key 'Names'.

    Note: not complete parsing, consider to merge Names to the last Event??
    and make event or name to be the dict key?

    Format:
    CC   -!- ALTERNATIVE PRODUCTS:
    CC       Event=Alternative promoter;
    CC         Comment=Free text;
    CC       Event=Alternative splicing; Named isoforms=n;
    CC         Comment=Optional free text;
    CC       Name=Isoform_1; Synonyms=Synonym_1[, Synonym_n];
    CC         IsoId=Isoform_identifier_1[, Isoform_identifier_n]; Sequence=Displayed;
    CC         Note=Free text;
    CC       Name=Isoform_n; Synonyms=Synonym_1[, Synonym_n];
    CC         IsoId=Isoform_identifier_1[, Isoform_identifier_n]; Sequence=VSP_identifier_1 [, VSP_identifier_n];
    CC         Note=Free text;
    CC       Event=Alternative initiation;
    CC         Comment=Free text;
    """
    events = []
    for event_lines in cc_alternative_products_event_finder(content_list):
        sections = list(cc_alternative_products_name_finder(event_lines))
        event_dict = join_split_dict_parser(sections[0])
        isoforms = sections[1:]
        if isoforms:
            event_dict['Names'] = [
                    join_split_dict_parser(isoform) for isoform in isoforms]
        events.append(event_dict)
    return events

def cc_biophysicochemical_properties_parser(content):
    """return a dict of {sub_topic: parsed content} from the content lines
    of a BIOPHYSICOCHEMICAL PROPERTIES topic.

    The content is cut into sub topics with hanging_paragraph_finder.  The
    first line of a sub topic, without its trailing ':' and spaces, is the
    key.  The content of 'Kinetic parameters' and 'Absorption' is grouped
    into a dict that allows multiple values per key; the content of any
    other sub topic goes through join_parser.  A sub topic key may appear
    only once ('no_duplicated_key').

    Format of the topic block:
    CC   -!- BIOPHYSICOCHEMICAL PROPERTIES:
    CC       Absorption:
    CC         Abs(max)=xx nm;
    CC         Note=free_text;
    CC       Kinetic parameters:
    CC         KM=xx unit for substrate [(free_text)];
    CC         Vmax=xx unit enzyme [free_text];
    CC         Note=free_text;
    CC       pH dependence:
    CC         free_text;
    CC       Redox potential:
    CC         free_text;
    CC       Temperature dependence:
    CC         free_text;

    Example of the topic block:
    CC   -!- BIOPHYSICOCHEMICAL PROPERTIES:
    CC       Kinetic parameters:
    CC         KM=98 uM for ATP;
    CC         KM=688 uM for pyridoxal;
    CC         Vmax=1.604 mmol/min/mg enzyme;
    CC       pH dependence:
    CC         Optimum pH is 6.0. Active from pH 4.5 to 10.5;
    """
    #sub topics whose content is a set of key=value entries
    keyed_sub_topics = ('Kinetic parameters', 'Absorption')

    def parse_sub_topic(sub_topic):
        """return (sub_key, parsed sub_content) from lines of a sub_topic"""
        sub_key = sub_topic[0].rstrip(': ')
        #strip the two leading spaces of the content lines
        body = [strip(line) for line in sub_topic[1:]]

        if sub_key not in keyed_sub_topics:
            return sub_key, join_parser(body, chars_to_strip='; ')

        #group into a dict which allow multiple values.
        pairs = join_split_parser(body, delimiters=[';', ('=', 1)])
        return sub_key, pairs_to_dict(pairs, 'allow_multi_value')

    parsed_sub_topics = [parse_sub_topic(block)
            for block in hanging_paragraph_finder(content)]
    return pairs_to_dict(parsed_sub_topics, 'no_duplicated_key')
    
#maps a CC topic name to the parser of its content list; cc_parser passes
#this dict as handlers and uses join_parser for every topic not listed
cc_content_parsers = {
    #? not complete: further group alternative splicing?
    'ALTERNATIVE PRODUCTS': cc_alternative_products_parser, 
    'BIOPHYSICOCHEMICAL PROPERTIES': cc_biophysicochemical_properties_parser,
    'INTERACTION': cc_interaction_parser, 
    'DATABASE': join_split_dict_parser, 
    'MASS SPECTROMETRY':join_split_dict_parser, 
}


#################################
# REFs parser
def refs_parser(lines):
    """return a dict of {RN: single_ref_dict}

    lines: all reference lines of one entry.  They are cut into reference
    blocks by ref_finder, and each block is parsed by single_ref_parser.

    These lines comprise the literature citations. The citations indicate the
    sources from which the data has been abstracted.  if several references are
    given, there will be a reference block for each.
    """
    parsed_blocks = [single_ref_parser(block) for block in ref_finder(lines)]
    return pairs_to_dict(parsed_blocks)

def is_ref_line(x):
    """return True if the line starts with 'RN'."""
    return x.startswith('RN')

#record finder labelled by the RN line, used by refs_parser
ref_finder = LabeledRecordFinder(is_ref_line)

#labels every reference block must have, checked by single_ref_parser when
#strict; 'RA/RG' stands for at least one of RA and RG.  Each label is
#listed once (RL used to appear twice, which only repeated the same check).
required_ref_labels = 'RN RP RL RA/RG'.split()
def single_ref_parser(lines, strict=False):
    """return rn, ref_dict from lines of a single reference block

    lines: the lines of one reference block, with their line codes.
    strict: if True (default False), raise RecordError if lacking required
    labels.  Having either RA or RG satisfies the 'RA/RG' requirement.

    The lines are grouped by line code and every group is parsed by the
    parser registered for its code in ref_parsers.  The parsed RN value is
    taken out of the dict and returned separately.

    Warning: using global required_ref_labels.

    The reference lines for a given citation occur in a block, and are always
    in the order RN, RP, RC, RX, RG, RA, RT and RL. Within each such reference
    block, the RN line occurs once, the RC, RX and RT lines occur zero or more
    times, and the RP, RG/RA and RL lines occur one or more times.
    """
    #group by linecode
    coded_lines = [linecode_maker(line) for line in lines]
    raw_dict = pairs_to_dict(coded_lines, 'always_multi_value')

    if strict:
        present = set(raw_dict.keys())
        #either author label counts for the combined requirement
        if 'RA' in present or 'RG' in present:
            present.add('RA/RG')
        for rlabel in required_ref_labels:
            if rlabel in present:
                continue
            raise RecordError(('The reference block lacks required label: '
                '%s') % rlabel)

    #parse each field with relevant parser
    parsed_dict = pairs_to_dict(raw_dict.items(), handlers=ref_parsers)
    rn = parsed_dict.pop('RN')
    return rn, parsed_dict


## ref_parsers here below

def rx_parser(lines):
    """return a dict from RX lines.

    The labels are taken off the lines, which are then parsed by
    join_split_dict_parser with the delimiters '; ' and '='.

    The RX (Reference cross-reference) line is an optional line which is used
    to indicate the identifier assigned to a specific reference in a
    bibliographic database. The format:
    RX   Bibliographic_db=IDENTIFIER[; Bibliographic_db=IDENTIFIER...];

    Where the valid bibliographic database names and their associated
    identifiers are:
       MEDLINE   Eight-digit MEDLINE Unique Identifier (UI)
       PubMed    PubMed Unique Identifier (PMID)
       DOI       Digital Object Identifier (DOI), examples:
           DOI=10.2345/S1384107697000225
           DOI=10.4567/0361-9230(1997)42:<OaEoSR>2.0.TX;2-B
           http://www.doi.org/handbook_2000/enumeration.html#2.2
    """
    unlabelled = labeloff(lines)
    return join_split_dict_parser(unlabelled, delimiters=['; ', '='])

def rc_parser(lines):
    """return a dict from RC lines.

    The labels are taken off the lines, which are then parsed by
    join_split_dict_parser with its default settings.

    The RC (Reference Comment) lines are optional lines which are used to store
    comments relevant to the reference cited. The format:
    RC   TOKEN1=Text; TOKEN2=Text; ...

    The currently defined tokens and their order in the RC line are:
    STRAIN TISSUE TRANSPOSON PLASMID
    """
    unlabelled = labeloff(lines)
    return join_split_dict_parser(unlabelled)

def rg_parser(lines):
    """return a list of str(group names) from RG lines

    The labels are taken off the lines, which are then parsed by
    join_split_parser with its default settings.

    The Reference Group (RG) line lists the consortium name associated with a
    given citation. The RG line is mainly used in submission reference blocks,
    but can also be used in paper references, if the working group is cited as
    an author in the paper. RG line and RA line (Reference Author) can be
    present in the same reference block; at least one RG or RA line is
    mandatory per reference block. example:
    RG   The mouse genome sequencing consortium;
    """
    unlabelled = labeloff(lines)
    return join_split_parser(unlabelled)

def ra_parser(lines):
    """return a list from RA lines.

    The labels are taken off the lines, which are then parsed by
    join_split_parser with ';' as the characters to strip and ',' as the
    delimiter.

    The RA (Reference Author) lines list the authors of the paper (or other
    work) cited. RA might be missing in references that cite a reference group
    (see RG line). At least one RG or RA line is mandatory per reference block.

    All of the authors are included, and are listed in the order given in the
    paper. The names are listed surname first followed by a blank, followed by
    initial(s) with periods. The authors' names are separated by commas and
    terminated by a semicolon. Author names are not split between lines. eg:
    RA   Galinier A., Bleicher F., Negre D., Perriere G., Duclos B.;

    All initials of the author names are indicated and hyphens between initials
    are kept.  An author's initials can be followed by an abbreviation such as
    'Jr' (for Junior), 'Sr' (Senior), 'II', 'III' or 'IV'.  Example:
    RA   Nasoff M.S., Baker H.V. II, Wolf R.E. Jr.;
    """
    unlabelled = labeloff(lines)
    return join_split_parser(unlabelled, chars_to_strip=';', delimiters=',')

def rp_parser(lines):
    """return the RP lines joined by a space, stripped of '.' and spaces.

    The RP (Reference Position) lines describe the extent of the work relevant
    to the entry carried out by the authors. format:
    RP   COMMENT.
    """
    joined = ' '.join(labeloff(lines))
    return joined.strip('. ')

def rl_parser(lines):
    """return the RL lines joined by a space, stripped of '.' and spaces.

    Note: not complete parsing; the citation is returned as one string
    whatever its kind.

    The RL (Reference Location) lines contain the conventional citation
    information for the reference. In general, the RL lines alone are
    sufficient to find the paper in question.

    a) Journal citations
    RL   Journal_abbrev Volume:First_page-Last_page(YYYY).
    When a reference is made to a paper which is 'in press'
    RL   Int. J. Parasitol. 0:0-0(2005).

    b) Electronic publications
    includes an '(er)' prefix. The format is indicated below:
    RL   (er) Free text.

    c) Book citations
    RL   (In) Editor_1 I.[, Editor_2 I., Editor_X I.] (eds.);
    RL   Book_name, pp.[Volume:]First_page-Last_page, Publisher, City (YYYY).
    Examples:
    RL   (In) Rich D.H., Gross E. (eds.);
    RL   Proceedings symposium, pp.69-72, Pierce
    RL   Chemical Co., Rockford Il. (1981).

    d) Unpublished results, eg:
    RL   Unpublished results, cited by:
    RL   Shelnutt J.A., Rousseau D.L., Dethmers J.K., Margoliash E.;
    RL   Biochemistry 20:6485-6497(1981).

    e) Unpublished observations, format:
    RL   Unpublished observations (MMM-YYYY).

    f) Thesis, format:
    RL   Thesis (Year), Institution_name, Country.

    g) Patent applications, format:
    RL   Patent number Pat_num, DD-MMM-YYYY.

    h) Submissions, format:
    RL   Submitted (MMM-YYYY) to Database_name.
    'Database_name' is one of the following:
    EMBL/GenBank/DDBJ, Swiss-Prot, PDB, PIR.
    """
    joined = ' '.join(labeloff(lines))
    return joined.strip('. ')

def rt_parser(lines):
    """return the RT lines joined by a space, stripped of the characters
    '.', '"' and ';' at both ends.

    The RT (Reference Title) lines give the title of the paper (or other work)
    cited as exactly as possible given the limitations of the computer
    character set. The format of the RT line is:
    RT   "Title.";
    """
    joined = ' '.join(labeloff(lines))
    return joined.strip('.";')


def rn_parser(lines):
    """return an integer from RN lines (only the first line is read).

    The RN (Reference Number) line gives a sequential number to each reference
    citation in an entry. This number is used to indicate the reference in
    comments and feature table notes. The format of the RN line is:
    RN   [##]
    """
    first_line = labeloff(lines)[0]
    #take off the surrounding brackets and spaces
    number_text = first_line.strip(' []')
    return int(number_text)

#maps each reference line code to its parser; single_ref_parser passes this
#dict as handlers, and linecode_merging_maker uses its keys to decide which
#line codes are merged into 'REF'
ref_parsers = {
    'RN': rn_parser,
    'RP': rp_parser,
    'RC': rc_parser,
    'RX': rx_parser,
    'RG': rg_parser,
    'RA': ra_parser,
    'RT': rt_parser,
    'RL': rl_parser,
}

#labels a record must have when MinimalEbiParser is strict; '' is the line
#code of the sequence lines and REF stands for the merged reference lines
required_labels = ['ID', 'AC', 'DT', 'DE', 'OS', 'OC', 'OX', 'SQ', 'REF', '']
#################################
# Minimal Ebi parser
def MinimalEbiParser(lines, strict=True, selected_labels=[]):
    """yield each (sequence as a str, a dict of header) from ebi record lines

    lines: the lines of one or more records, cut into records by EbiFinder.
    strict: if True (default), raise RecordError if a record does not begin
    with an ID line or lacks one of the required labels.
    selected_labels: if not empty, only these labels are kept in the header
    dict.

    The header dict maps each line code to the list of its raw lines; all
    reference line codes are merged under 'REF'.  The sequence is the joined
    sequence lines with tabs, newlines and spaces removed.

    Warning: using the global required_labels.

    Line code   Content     Occurrence in an entry
    ID  Identification  Once; starts the entry
    AC  Accession number(s) Once or more
    DT  Date    Three times
    DE  Description Once or more
    GN  Gene name(s)    Optional
    OS  Organism species    Once
    OG  Organelle   Optional
    OC  Organism classification Once or more
    OX  Taxonomy cross-reference    Once
    RN  Reference number    Once or more
    RP  Reference position  Once or more
    RC  Reference comment(s)    Optional
    RX  Reference cross-reference(s)    Optional
    RG  Reference group Once or more (Optional if RA line)
    RA  Reference authors   Once or more (Optional if RG line)
    RT  Reference title Optional
    RL  Reference location  Once or more
    CC  Comments or notes   Optional
    DR  Database cross-references   Optional
    KW  Keywords    Optional
    FT  Feature table data  Optional
    SQ  Sequence header Once
    (blanks)    Sequence data   Once or more
    //  Termination line    Once; ends the entry

    The two-character line-type code that begins each line is always followed
    by three blanks, so that the actual information begins with the sixth
    character. Information is not extended beyond character position 75 except
    for one exception: CC lines that contain the 'DATABASE' topic"""
    for record in EbiFinder(lines):
        if strict and not record[0].startswith('ID'):
            raise RecordError('Record must begin with ID line')
        #the last line must be //, ensured by the finder
        del record[-1]

        #group the lines by their (merged) line code
        coded_lines = [linecode_merging_maker(line) for line in record]
        raw_dict = pairs_to_dict(coded_lines, 'always_multi_value',
                all_keys=_parsers)

        if strict:
            for rlabel in required_labels:
                if rlabel in raw_dict:
                    continue
                raise RecordError(('The record lacks required label: '
                    '%s') % rlabel)

        #'' is the linecode for the sequence lines
        sequence_lines = raw_dict.pop('')
        sequence = ''.join(sequence_lines).translate(all_chars, '\t\n ')

        if selected_labels:
            unwanted = [key for key in raw_dict.keys()
                    if key not in selected_labels]
            for key in unwanted:
                del raw_dict[key]

        yield sequence, raw_dict

def linecode_merging_maker(line):
    """return (linecode, line), with every reference linecode merged.

    The linecode is the first item returned by linecode_maker for the line.
    All valid reference linecodes (the keys of ref_parsers) are replaced by
    'REF'; the line itself is returned unchanged.

    Warning: using global ref_parsers"""
    code = linecode_maker(line)[0]
    if code not in ref_parsers:
        return code, line
    return 'REF', line

#################################
# EbiParser
def parse_header(header_dict, strict=True):
    """Parses a dictionary of header lines

    header_dict: maps each label to its raw lines.  Every value is parsed by
    the handler registered for its label in _parsers, and no label may
    appear twice ('no_duplicated_key').
    strict: accepted so that EbiParser can pass it; not used here.
    """
    label_lines = header_dict.items()
    return pairs_to_dict(label_lines, 'no_duplicated_key', handlers=_parsers)
 
#maps each (merged) label of a record to the parser of its lines; used as
#handlers by parse_header and as all_keys by MinimalEbiParser.  'REF' covers
#all reference lines; '' (sequence lines) and '//' have no parser.
_parsers = {
    'ID': id_parser,
    'AC': ac_parser,
    'DE': de_parser,
    'DT': dt_parser,
    'GN': gn_parser,
    'OC': oc_parser, 
    'OS': os_parser, 
    'OX': ox_parser, 
    'OG': og_parser, 
    'REF': refs_parser,
    'CC': cc_parser,
    'DR': dr_parser,
    'KW': kw_parser,
    'FT': ft_parser,
    'SQ': sq_parser,
    '': None,
    '//': None,
}

def EbiParser(lines, seq_constructor=Sequence,
        header_constructor=parse_header, strict=True, selected_labels=[]):
    """Parser for the EBI data format.

    Yields a (sequence, header) pair for each record.

    lines: input data (list of lines or file stream)
    seq_constructor: constructor function to construct sequence, 'Sequence' by
        default.  If false, the sequence is yielded as a plain str.
    header_constructor: function to process the header information. Default
        is 'parse_header'
    strict: whether an exception should be raised in case of a problem
        (strict=True) or whether the bad record should be skipped
        (strict=False).  Only RecordError, FieldError and ValueError raised
        by the header constructor are skipped.
    selected_labels: Labels from the original data format that you want
        returned. All the original header labels are used, except for
        REFERENCES, which is 'REF'.
    """
    records = MinimalEbiParser(lines, strict=strict,
            selected_labels=selected_labels)
    for sequence, header_dict in records:
        if seq_constructor:
            sequence = seq_constructor(sequence)
        try:
            header = header_constructor(header_dict, strict=strict)
        except (RecordError, FieldError, ValueError):
            if not strict:
                #skip the bad record
                continue
            #!! just raise is better than raise RecordError
            raise

        yield sequence, header


#Script entry: with a file name argument, parse that file strictly and print
#the entry name of every record; without one, parse the sample records
#below non-strictly and pretty print the result.
if __name__ == "__main__":
    from getopt import getopt, GetoptError
    usage = """ Usage: python __.py [options] [source]

Options:
  -h, --help              show this help
  -d                      show debugging information while parsing

Examples:
"""
    try:
        opts, args = getopt(sys.argv[1:], "hd", ["help"])
    except GetoptError:
        print usage; sys.exit(2)
    #only -h/--help is acted on; -d is accepted by getopt but nothing below
    #reads it
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print usage; sys.exit()
    if args:
        #the first positional argument is the file to parse
        lines = file(args[0])
        print 'Parsing the file'
        for i, rec in enumerate(EbiParser(lines, strict=True)):
            print '\r %s: %s' % (i,rec[1]['ID']['EntryName']) ,
    else: 
        #no source given: two sample records, the second one lacking most of
        #the labels listed in required_labels
        lines="""\
ID   Q9U9C5_CAEEL   PRELIMINARY;      PRT;   218 AA. 
AC   Q9U9C5;hdfksfsdfs;sdfsfsfs;
DT   01-MAY-2000 (TrEMBLrel. 13, Created)
DT   01-MAY-2000 (TrEMBLrel. 13, Last sequence update)
DT   13-SEP-2005 (TrEMBLrel. 31, Last annotation update)
DE   Basic salivary proline-rich protein 4 allele L (Salivary proline-rich 
DE   protein Po) (Parotid o protein) [Contains: Peptide P-D (aa); BB (bb) 
DE   (bbb)] (Fragment).
GN   Name=nob-1; ORFNames=Y75B8A.2, Y75B8A.2B;
GN   and
GN   Name=Jon99Ciii; Synonyms=SER2, SER5, Ser99Db; ORFNames=CG15519;
OS   Caenorhabditis elegans (aa) (bb).
OC   Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
OC   Rhabditidae; Peloderinae; Caenorhabditis.
OG   Plastid; Apicoplast.
OG   Plasmid R6-5, Plasmid IncFII R100 (NR1), and
OG   Plasmid IncFII R1-19 (R1 drd-19).
OX   NCBI_TaxID=6239;
RN   [1]
RP   NUCLEOTIDE SEQUENCE.
RC   STRAIN=N2;
RX   MEDLINE=20243724; PubMed=10781051; DOI=10.1073/pnas.97.9.4499;
RA   Van Auken K., Weaver D.C., Edgar L.G., Wood W.B.;
RT   "Caenorhabditis elegans embryonic axial patterning requires two
RT   recently discovered posterior-group Hox genes.";
RL   Proc. Natl. Acad. Sci. U.S.A. 97:4499-4503(2000).
RN   [2]
RP   NUCLEOTIDE SEQUENCE.
RC   STRAIN=N2;
RG   The mouse genome sequencing consortium;
RL   Submitted (JUL-1999) to the EMBL/GenBank/DDBJ databases.
CC   -!- SUBCELLULAR LOCATION: Nuclear (By similarity).
CC   -!- DATABASE: NAME=slkdfjAtlas Genet. Cytogenet. Oncol. Haematol.;
CC       WWW="http://www.infobiogen.fr/services/chromcancer/Genes/
CC   -!- DATABASE: NAME=Atlas Genet. Cytogenet. Oncol. Haematol.;
CC       WWW="http://www.infobiogen.fr/services/chromcancer/Genes/
CC       P53ID88.html".
CC   -!- INTERACTION:
CC       P51617:IRAK1; NbExp=1; IntAct=EBI-448466, EBI-358664;
CC       P51617:IRAK1; NbExp=1; IntAct=EBI-448472, EBI-358664;
CC   -!- ALTERNATIVE PRODUCTS:
CC       Event=Alternative splicing; Named isoforms=3;
CC         Comment=Additional isoforms seem to exist. Experimental
CC         confirmation may be lacking for some isoforms;
CC       Name=1; Synonyms=AIRE-1;
CC         IsoId=O43918-1; Sequence=Displayed;
CC       Name=2; Synonyms=AIRE-2;
CC         IsoId=O43918-2; Sequence=VSP_004089;
CC       Name=3; Synonyms=AIRE-3;
CC         IsoId=O43918-3; Sequence=VSP_004089, VSP_004090;
CC   -!- BIOPHYSICOCHEMICAL PROPERTIES:
CC       Kinetic parameters:
CC         KM=98 uM for ATP;
CC         KM=688 uM for pyridoxal;
CC         Vmax=1.604 mmol/min/mg enzyme;
CC       pH dependence:
CC         Optimum pH is 6.0. Active from pH 4.5 to 10.5;
CC   -!- MASS SPECTROMETRY: MW=13822; METHOD=MALDI; RANGE=19-140 (P15522-
CC       2); NOTE=Ref.1.
CC   --------------------------------------------------------------------------
CC   This SWISS-PROT entry is copyright. It is produced through a collaboration
CC   removed.
CC   --------------------------------------------------------------------------
DR   EMBL; AF172090; AAD48874.1; -; mRNA.
DR   EMBL; AL033514; CAC70124.1; -; Genomic_DNA.
DR   HSSP; P02833; 9ANT.
KW   Complete proteome; DNA-binding; Developmental protein; Homeobox;
KW   Hypothetical protein; Nuclear protein.
FT   DNA_BIND    >102    292
FT   REGION        1     44       Transcription activation (acidic).
FT   CHAIN        23    611       Halfway protein.
FT                                /FTId=PRO_0000021413.
FT   VARIANT       1      7       unknown  (in a skin tumor).
FT                                /FTId=VAR_005851.
FT   VARIANT       7      7       D -> H (in a skin tumor).
FT                                /FTId=VAR_005851.
FT   CONFLICT    282    282       R -> Q (in Ref. 18).
FT   STRAND      103    103
FT   NON_TER     80     80        non_ter.
SQ   SEQUENCE   218 AA;  24367 MW;  F24AE5E8A102FAC6 CRC64;
     MISVMQQMIN NDSPEDSKES ITSVQQTPFF WPSAAAAIPS IQGESRSERE SETGSSPQLA
     PSSTGMVMPG TAGMYGFGPS RMPTANEFGM MMNPVYTDFY QNPLASTDIT IPTTAGSSAA
     TTPNAAMHLP WAISHDGKKK RQPYKKDQIS RLEYEYSVNQ YLTNKRRSEL SAQLMLDEKQ
     VKVWFQNRRM KDKKLRQRHS GPFPHGAPVT PCIERLIN
//
ID   Q9U9C5_TEST   PRELIMINARY;      PRT;   218 AA. 
DT   ddd.
AC   Q9U9C5;hdfksfsdfs;sdfsfsfs;
SQ   SEQUENCE   218 AA;  24367 MW;  F24AE5E8A102FAC6 CRC64;
     MISVMQQMIN NDSPEDSKES ITSVQQTPFF WPSAAAAIPS IQGESRSERE
//
""".split('\n')
        pprint(list(EbiParser(lines,strict=False, selected_labels=[])))


    #from time import time
    ##sys.exit()
    #if len(sys.argv) > 1:
    #    #f = file('/home/zongzhi/Projects/SNP/working/data/uniprot_sprot_human.dat')
    #    f = file('/home/zongzhi/Projects/SNP/working/data/uniprot_sprot_fungi.dat')
    #    #f = file('/home/zongzhi/Projects/SNP/snp_tests/ebi_test.txt')

    #    i = 0
    #    for sequence, head in MinimalEbiParser(f):
    #        i += 1
    #        if i>10000: sys.exit()
    #        print '%s \r' % i, 
    #        try:
    #            de = ' '.join(head['OG'])
    #        except KeyError, e:
    #            pass
    #            #print e 
    #        else:
    #            if 'Plasmid' in de:
    #                print de, '\n'