/usr/share/acl2-8.0dfsg/serialize-raw.lisp is in acl2-source 8.0dfsg-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 | ; ACL2 Version 8.0 -- A Computational Logic for Applicative Common Lisp
; Copyright (C) 2017, Regents of the University of Texas
; This version of ACL2 is a descendent of ACL2 Version 1.9, Copyright
; (C) 1997 Computational Logic, Inc. See the documentation topic NOTE-2-0.
; This program is free software; you can redistribute it and/or modify
; it under the terms of the LICENSE file distributed with ACL2.
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; LICENSE for more details.
; Regarding authorship of ACL2 in general:
; Written by: Matt Kaufmann and J Strother Moore
; email: Kaufmann@cs.utexas.edu and Moore@cs.utexas.edu
; Department of Computer Science
; University of Texas at Austin
; Austin, TX 78712 U.S.A.
; serialize-raw.lisp -- a scheme for serializing ACL2 objects to disk.
; This file was initially developed and contributed by Jared Davis on behalf of
; Centaur Technology. Comments referring to "I" or "my" are from Jared. This
; file is now maintained by the ACL2 authors (see above).
(in-package "ACL2")
; INTRODUCTION
;
; We now develop a serialization scheme that allows ACL2 objects to be saved to
; disk using a compact, structure-shared, binary encoding.
;
; We configure ACL2's print-object$ function so that it writes objects with our
; encoding scheme when writing certificate files. This allows large objects
; produced by make-event to be read and written efficiently.
;
; We extend the ACL2 readtable so that serialized objects can be read at any
; time, using the extended reader macros #Y[...] and #Z[...]. These macros are
; almost identical, but
;
; #Y rebuilds the object entirely with CONS and does not restore any
; fast alists, whereas
;
; #Z uses HONS for the parts of the structure that were originally normed,
; and rebuilds the hash tables for fast alists.
;
; We provide routines for reading and writing ACL2 objects as individual files,
; typically with a ".sao" extension for "Serialized ACL2 Object". For
; bootstrapping reasons, these are introduced in hons.lisp and hons-raw.lisp
; instead of here in serialize-raw.lisp.
; Essay on Bad Objects and Serialize
;
; When we decode serialized objects, must we ensure that the object returned is
; a valid ACL2 object, i.e., not something that BAD-LISP-OBJECTP would reject?
;
; Matt and Jared think the answer is "no" for the reader macros. Why? These
; macros are just extensions of certain readtables like the *acl2-readtable*,
; which are used by READ to interpret input. But there are already any number
; of other ways for READ to produce bad objects, for instance it might produce a
; floating point numbers, symbols in a foreign package, vectors, structures,
; etc. The "Remarks on *acl2-readtable*" in acl2.lisp has more discussion of
; these matters. At any rate, whenever ACL2 is using READ, it already needs to
; be defending against bad objects, so it should be okay if the serialize reader
; macros generate bad objects.
;
; However, Jared thinks that the serialize-read-fn function *is* responsible for
; ensuring that only good objects are read, because it is a "new" way for input
; to enter the system.
;
; Well, the serialized object format is considerably more restrictive than the
; Common Lisp reader, and does not provide any way to encode floats, circular
; objects, etc. Jared thinks the only bad objects that can be produced from
; serialized reading are symbols in unknown packages. So, in
; serialize-read-file we pass in a flag that makes sure we check whether
; packages are known. We think this is sufficient to justify not checking
; BAD-LISP-OBJECTP explicitly.
; Essay on the Serialize Object Format
;
; Our scheme involves encoding ACL2 objects into a fairly simple, byte-based,
; binary format.
;
; There are actually three different formats of serialized objects, named V1,
; V2 and V3. For compatibility with previously written files, we support
; reading from all three file formats. But we always write files using the
; newest, V3 format.
;
; Why these different versions?
;
; V1. We originally developed our serialization scheme as a ttag-based
; library in :dir :system, but this left us with no way to tightly
; integrate it into ACL2's certificate printing/reading routines.
;
; V2. When we moved serialization into ACL2 proper, we noticed a few things
; that we could improve, and tweaked the serialization scheme. The new
; scheme, V2, wasn't compatible with community book books/serialize, but
; we already had many files written in the old V1 format.
;
; V3. Later, we realized that it would be easy to restore fast alists within
; serialized objects, and that this would allow the fast alists defined
; in a book to still be fast after including the book. The new format,
; V3, added this feature, but again we had lots of V2 files that we
; still wanted to be able to read.
;
; Eventually we should be able to drop support for the old versions, but the
; formats are all very similar so supporting them is not that much work. Since
; all of the versions are very similar, we begin with the V1 format. A
; sequence of objects is encoded by OBJECT: first MAGIC, then LEN, then the
; indicated homogeneous sequences of objects, and then MAGIC.
;
; OBJECT ::= MAGIC ; marker for sanity checking
; LEN ; total number of objects
; NATS RATS COMPLEXES CHARS ; object data
; STRS PACKAGES CONSES ;
; MAGIC ; marker for sanity checking
;
; NATS ::= LEN NAT* ; number of nats, followed by that many nats
; RATS ::= LEN RAT* ; number of rats, followed by that many rats
; COMPLEXES ::= LEN COMPLEX* ; etc.
; CHARS ::= LEN CHAR*
; STRS ::= LEN STR*
; PACKAGES ::= LEN PACKAGE*
; CONSES ::= LEN CONS*
;
; RAT ::= NAT NAT NAT ; sign (0 or 1), numerator, denominator
; COMPLEX ::= RAT RAT ; real, imaginary parts
; CHAR ::= byte ; the character code for this character
; STR ::= LEN CHAR* ; length and then its characters
; PACKAGE ::= STR LEN STR* ; package name, number of symbols, symbol names
; CONS ::= NAT NAT ; "index" of its car and cdr (see below)
;
; LEN ::= NAT ; just to show when we're referring to a length
;
; MAGIC ::= #xAC120BC7 ; also see the discussion below
;
; NAT ::= [see below]
;
; You can experiment with these formats, for example as follows in raw Lisp.
;
; (let ((str (with-output-to-string (s) (ser-encode-to-stream 5/2 s))))
; (loop for i from 0 to (1- (length str))
; collect (char-code (char str i))))
;
;
; Magic Numbers. The magic number, #xAC120BC7, is a 32-bit integer that sort
; of looks like "ACL2 OBCT", i.e., "ACL2 Object." This use of magic numbers is
; probably silly, but may have some advantages:
;
; (1) It lets us do a basic sanity check.
;
; (2) When serialize is used to write out whole files, helps to ensure the
; file doesn't start with valid ASCII characters. This *might* help
; protect these files from tampering by programs that convert newline
; characters in text files (e.g., FTP programs).
;
; (3) It gives us the option of tweaking our encoding. Today we use distinct
; magic numbers to identify V1, V2, and V3 files, and in the future we
; could add additional encodings by adding other magic numbers.
;
;
; Naturals. Our representation of NAT is slightly involved since we need to
; support arbitrary-sized natural numbers. We use a variable-length encoding
; where the most significant bit of each byte is 0 if this is the last piece of
; the number, or 1 if there are additional bytes, and the other 7 bits are data
; bits. The bytes are kept in little-endian order. For example:
;
; 0 is encoded as #x00 (continue bit: 0, data: 0)
; 2 is encoded as #x02 (continue bit: 0, data: 2)
; ...
; 127 is encoded as #x7F (continue bit: 0, data: 127)
; 128 is encoded as #x80 #x01 [(c: 1, d: 0) = 0] + 128*[(c: 0, d: 1) = 1]
; 129 is encoded as #x81 #x01 [(c: 1, d: 1) = 1] + 128*[(c: 0, d: 1) = 1]
; 519 is encoded as #x87 #x04 [(c: 1, d: 7) = 7] + 128*[(c: 0, d: 4) = 4]
; 16383 is encoded as #xFF #x7F
; [(c: 1, d: 127) = 127] + 128 * [(c: 0, d: 127) = 127]
; 16384 is encoded as #x80 #x80 #x01
; [(c: 1, d: 0) = 0] + 128 * [(c: 0, d: 0) = 0]
; + 128^2*[(c: 0, d: 1) = 1]
; 16387 is encoded as #x83 #x80 #x01
; [(c: 1, d: 3) = 3] + 128 * [(c: 0, d: 0) = 0]
; + 128^2*[(c: 0, d: 1) = 1]
; ...
;
;
; Negative Integers. Negative integers aren't mentioned in the file format
; because we encode them as rationals with denominator 1. This only requires 2
; bytes of overhead (for the sign and denominator) beyond the magnitude of the
; integer, which seems acceptable since negative integers aren't especially
; frequent.
;
;
; Conses. Every object in the file has an (implicit) index, determined by its
; position in the file. The naturals are given the smallest indices 0, 1, 2,
; and so on. Supposing there are N naturals, the rationals will have indices
; N, N+1, etc. After that we have the complexes, then the characters, strings,
; symbols, and finally the conses.
;
; We encode conses using these indices. For instance, suppose the first two
; natural numbers in our file are 37 and 55. Since we start our indexing with
; the naturals, 37 will have index 0 and 55 will have index 1. Then, we can
; encode the cons (37 . 55) by just writing down these two indices, e.g., #x00
; #x01.
;
; We insist that sub-trees of conses come first in the file, so as we are
; decoding the file, whenever we construct a cons we can make sure its indices
; refer to already-constructed conses.
;
; The object with the maximal index is "the object" that has been saved,
; and is returned by the #Y and #Z readers, or by the whole-file reader,
; serialize-read.
;
;
; The V2 Format. The V2 format is almost the same as the V1 format, but with
; the following changes that allow us to restore the normedness of conses.
;
; (1) The magic number changes to #xAC120BC8, so we know which format is
; being used,
;
; (2) We tweak the way indices are handled so that NIL and T are implicitly
; given index 0 and 1, respectively (i.e., we do not actually write out
; those symbols), which can sometimes slightly improve compression for
; cons structures that have lots of NILs and Ts within them (since
; without this tweak, NIL and T might have large indices that are
; represented using many bytes).
;
; (3) We change the way conses are represented so we can mark which conses
; were normed. Instead of recording a cons by writing down its car-index
; and cdr-index verbatim, we now instead write down:
;
; (car-index << 1) | (if honsp 1 0), cdr-index
;
; Because of the way we encode naturals, this neatly only costs an extra
; byte if the car-index happens to have an integer length that is a
; multiple of 7. (To see this, note that our encoding is essentially a
; base-128 encoding, so we need an extra "digit" only when the top 7-bit
; "digit" has its top bit set to 1.)
;
; (4) Instead of the total number of objects, we replace LEN with the maximum
; index of the object to be read. Usually this just means that instead
; of LEN we record LEN - 1. But it allows us to detect the special case
; of NIL where the object being encoded is not necessarily at LEN - 1.
;
;
; The V3 format. The V3 format is almost the same as the V2 format, but with
; the following changes that allow us to restore fast alists.
;
; (1) The magic number changes to #xAC120BC9, and
;
; (2) We extend the OBJECT format with an extra FALS field, that comes
; after the conses. Note that LEN is unchanged and does not count
; the FALS.
;
; OBJECT ::= MAGIC ; marker for sanity checking
; LEN ; total number of objects
; NATS RATS COMPLEXES CHARS ; object data
; STRS PACKAGES CONSES ;
; FALS
; MAGIC ; marker for sanity checking
;
; FALS ::= FAL* 0 ; zero-terminated list
;
; FAL ::= NAT NAT ; index and hash-table-count
;
; (3) We change the encoding of STR so that we can mark which strings were
; normed. In place of recording the string's LEN, we instead record:
; (len << 1) | (if honsp 1 0)
; -----------------------------------------------------------------------------
;
; PRELIMINARIES
;
; -----------------------------------------------------------------------------
(defparameter *ser-verbose* nil)
(defmacro ser-time? (form)
`(if *ser-verbose*
(time ,form)
,form))
(defmacro ser-print? (msg &rest args)
`(when *ser-verbose*
(format t ,msg . ,args)))
; To make it easy to switch the kind of input/output stream being used, all of
; our stream reading/writing is done with the following macros.
;
; In previous versions of serialize we used binary streams and wrote/read from
; them with write/read-byte on most Lisps; on CCL we used memory-mapped files
; for better performance while reading. But we had to switch to using ordinary
; character streams to get compatibility with the Common Lisp reader (where we
; use #Y and #Z), which reads character streams.
(defmacro ser-write-char (x stream)
`(write-char (the character ,x) ,stream))
(defmacro ser-write-byte (x stream)
`(ser-write-char (code-char (the (unsigned-byte 8) ,x)) ,stream))
(defmacro ser-read-char (stream)
; Note that Lisp's read-char causes an end-of-file error (HyperSpec says that
; it "is signaled") if EOF is reached, so we don't have to detect unexpected
; EOFs in our decoding routines.
`(the character (read-char ,stream)))
(defmacro ser-read-byte (stream)
; How do we know that the char-code is 8 bits when reading a file stream? We
; try to enforce this in acl2-set-character-encoding. The magic number might
; save us if this changes, since its first byte is #xAC = 172 > 127 and will
; presumably be misread if we are using a file character encoding other than
; iso-8859-1.
`(the (unsigned-byte 8) (char-code (ser-read-char ,stream))))
(defun ser-encode-magic (stream)
;; We only write V3 files now, so we write AC120BC9 instead of C8 or C7.
(ser-write-byte #xAC stream)
(ser-write-byte #x12 stream)
(ser-write-byte #x0B stream)
(ser-write-byte #xC9 stream))
(defun ser-decode-magic (stream)
;; Returns :V1, :V2, or :V3, or causes an error.
(let* ((magic-1 (ser-read-byte stream))
(magic-2 (ser-read-byte stream))
(magic-3 (ser-read-byte stream))
(magic-4 (ser-read-byte stream)))
(declare (type (unsigned-byte 8) magic-1 magic-2 magic-3 magic-4))
(let ((version (and (= magic-1 #xAC)
(= magic-2 #x12)
(= magic-3 #x0B)
(cond ((= magic-4 #xC7) :v1)
((= magic-4 #xC8) :v2)
((= magic-4 #xC9) :v3)
(t nil)))))
(unless version
(error "Invalid serialized object, magic number incorrect: ~X ~X ~X ~X"
magic-1 magic-2 magic-3 magic-4))
version)))
; -----------------------------------------------------------------------------
;
; ENCODING AND DECODING NATURALS
;
; -----------------------------------------------------------------------------
; WHY DO WE USE 8-BIT BLOCKS?
;
; Originally I tried using 64-bit blocks. I thought this would mean only 1/64
; of the bits would be "overhead" for continue-bits, and surely this would be
; better than using 8-bit blocks, where 1/8 of the bits would be continue-bit
; overhead.
;
; This thinking is totally wrong. It ignores another important source of
; overhead: the unnecessary data-bits in the final block. To make this very
; concrete, think about encoding the number 5. We only "need" 3 bits. In an
; 8-bit encoding, we use 8 bits so the overhead is 5/8 = 62%. But in a 64-bit
; encoding we would need 64 bits for an overhead of 61/64 = 95%. So the 8-bit
; encoding is much more efficient for small integers.
;
; Another potential disadvantage of 64-bit blocks may seem to be that 64-bit
; numbers are not fixnums in CCL (or perhaps any Lisp circa 2014 or for years
; to come). However, that issue is probably actually quite minor; 8 8-bit
; fixnums quite possibly use more memory than one 64-bit bignum.
;
; Of course, there are cases where 64-bit blocks win. For instance, 2^62
; nicely fits into a single 64-bit block, but requires 9 8-bit blocks (at 7
; data bits apiece), i.e., 72 bits. But on some other larger numbers, 8-bit
; blocks can still be more efficient. Take 2^64. Here, we need either 2
; 64-bit blocks (at 63 data bits apiece) for 128 bits, or 10 8-bit blocks for
; 80 bits. In short, the wider encoding only wins when the numbers are very
; long, or when there aren't very many unnecessary data bits in the final
; block.
; WHY ALL THESE OPTIMIZATIONS?
;
; The performance of natural number encoding/decoding is especially important
; because we have to (1) encode/decode two naturals for every cons, and (2)
; encode/decode naturals all over the place for string lengths, symbol name
; lengths, and the representation of any number. These optimizations are a big
; deal: on one example benchmark, they improve CCL's decoding performance by
; almost 20% (as opposed to using just ser-encode-nat-large to encode
; naturals).
(defun ser-encode-nat-fixnum (n stream)
; Optimized encoder that assumes N is a fixnum.
(declare (type fixnum n))
(loop while (>= n 128)
do
(ser-write-byte (logior
;; The 7 low data bits
(the fixnum (logand n #x7F))
;; A continue bit
#x80)
stream)
(setq n (the fixnum (ash n -7))))
(ser-write-byte n stream))
(defun ser-encode-nat-large (n stream)
; Safe encoder that doesn't assume how large N is.
(declare (type (integer 0 *) n))
(loop until (typep n 'fixnum)
do
;; Fixnums are at least (signed-byte 16) in Common Lisp, so we must
;; be in the large case, i.e., n > 128.
(ser-write-byte (logior
;; The 7 low data bits
(the fixnum (logand (the integer n) #x7F))
;; A continue bit
#x80)
stream)
(setq n (the integer (ash n -7))))
(ser-encode-nat-fixnum n stream))
(defmacro ser-encode-nat (n stream)
; This is kind of silly, but it lets us avoid the function overhead of calling
; ser-encode-nat-large in the very common case that N is a fixnum.
(when (eq stream 'n)
(error "~s called with stream = N, which would cause capture!"
'ser-encode-nat))
`(let ((n ,n))
(if (typep n 'fixnum)
(ser-encode-nat-fixnum n ,stream)
(ser-encode-nat-large n ,stream))))
(defun ser-decode-nat-large (shift value stream)
; Simple (but unoptimized) natural number decoder that doesn't assume anything
; is a fixnum. Shift is 7 times the current block number we are reading, and
; represents how much we need to shift over the next 7 bits we read. Value is
; the already-summed value of the previous blocks we have read.
(declare (type integer value shift))
(let ((x1 (ser-read-byte stream)))
(declare (type fixnum x1))
(loop while (>= x1 128)
do
(incf value (ash (- x1 128) shift))
(incf shift 7)
(setf x1 (ser-read-byte stream)))
(incf value (ash x1 shift))
value))
(defmacro ser-decode-nat-body (shift)
; See SER-NAT-DECODE; this is accounting for different fixnum sizes across Lisps
; by unrolling the loop with a recursive macro. SHIFT is a constant that is
; being incremented by 7 on each "iteration". An invariant that is important to
; the fixnum optimizations is that VALUE is always less than 2^SHIFT.
(if (> (expt 2 (+ 7 shift)) most-positive-fixnum)
;; Can't unroll any further because we've reached the fixnum size, so fall
;; back to using the large decoder.
`(ser-decode-nat-large ,shift value stream)
`(progn
(setq x1 (ser-read-byte stream))
;; Reusing X1 is kind of goofy, but seems to result in better code on
;; CCL.
(cond
((< x1 128)
;; The returned VALUE + (X1 << SHIFT) is a fixnum since it is less than
;; 2^(7+SHIFT), which above we checked is a fixnum.
(setq x1 (the fixnum (ash x1 ,shift)))
(the fixnum (+ value x1)))
(t
;; Else, we increment value by (x1 - 128) << SHIFT. This is still a
;; fixnum because (x1 - 128) < 2^7, so the sum is under 2^(7+SHIFT).
;; To see this let x2=x1-128, s=SHIFT, and v=value; then since v<2^s
;; and x2 <= 2^7-1, we have:
;; value + (x1-128)<<s = v + x2*2^s < 2^s + (2^7-1)*2^s = 2^(7+s).
(setq x1 (the fixnum (- x1 128)))
(setq x1 (the fixnum (ash x1 ,shift)))
(setq value (the fixnum (+ value x1)))
;; Recursive macro expansion to unroll further.
(ser-decode-nat-body ,(+ 7 shift)))))))
(defun ser-decode-nat (stream)
; Optimized natural-number decoder. For small enough integers (under 128) we
; don't need any shifting nonsense or even an accumulator. For anything larger,
; we set up the initial VALUE accumulator and use our macro to write an
; unrolled, fixnum-optimized loop for us.
(let ((x1 (ser-read-byte stream)))
(declare (type fixnum x1))
(when (< (the fixnum x1) 128)
(return-from ser-decode-nat x1))
(setq x1 (the fixnum (- x1 128)))
(let ((value x1))
(declare (type fixnum value))
(ser-decode-nat-body 7))))
; -----------------------------------------------------------------------------
;
; ENCODING AND DECODING OTHER BASIC OBJECTS
;
; -----------------------------------------------------------------------------
; RAT ::= NAT NAT NAT ; sign (0 or 1), numerator, denominator
(declaim (inline ser-encode-rat ser-decode-rat))
(defun ser-encode-rat (x stream)
(declare (type rational x))
(ser-encode-nat (if (< x 0) 1 0) stream)
(ser-encode-nat (abs (numerator x)) stream)
(ser-encode-nat (denominator x) stream))
(defun ser-decode-rat (stream)
(let* ((sign (ser-decode-nat stream))
(numerator (ser-decode-nat stream))
(denominator (ser-decode-nat stream)))
(declare (type integer sign numerator denominator))
(cond ((= sign 1)
(setq numerator (- numerator)))
((= sign 0)
;; Fine, but there is nothing to do.
nil)
(t
;; This check probably isn't necessary; we could just assume that the
;; sign is zero. But it seems cheap enough and basically reasonable.
(error "Trying to decode rational, but the sign is invalid.")))
(when (= denominator 0)
;; This check probably isn't necessary since the Lisp should probably
;; signal an error if we try to divide by zero, but it seems cheap enough
;; and is probably basically reasonable.
(error "Trying to decode rational, but the denominator is zero."))
(the rational (/ numerator denominator))))
; COMPLEX ::= RAT RAT ; real, imaginary parts
(declaim (inline ser-encode-complex ser-decode-complex))
(defun ser-encode-complex (x stream)
(declare (type complex x))
(ser-encode-rat (realpart x) stream)
(ser-encode-rat (imagpart x) stream))
(defun ser-decode-complex (stream)
(let* ((realpart (ser-decode-rat stream))
(imagpart (ser-decode-rat stream)))
(declare (type rational realpart imagpart))
(when (= imagpart 0)
;; Hrmn. This check is probably unnecessary. (complex 3 0) is just 3.
;; Our encoder should never encode natural numbers as complexes, but it
;; wouldn't necessarily be wrong to do so.
(error "Trying to decode complex, but the imagpart is zero."))
(complex realpart imagpart)))
; (v1/v2): STR ::= LEN CHAR* ; length and then its characters
; (v3): STR ::= [(LEN << 1) | (if normedp 1 0)] CHAR*
; Note that our symbol encoding/decoding stuff piggy-backs on our string stuff,
; so we care about string encoding/decoding performance a bit.
;
; A very minor note is that in Common Lisp, the length of a string must be a
; fixnum (a string is a vector, which is a one-dimensional array, and hence its
; size must be less than the array-dimension-limit, which is a fixnum.)
(declaim (inline ser-encode-str ser-decode-str))
(defun ser-encode-str (x stream)
(declare (type string x))
(let* ((len (length x))
(normedp (hl-hspace-normedp-wrapper x))
(header (logior (ash len 1) (if normedp 1 0))))
(ser-encode-nat header stream)
(loop for n fixnum from 0 below (the fixnum len) do
(ser-write-char (char x n) stream))))
(defun ser-decode-str (version hons-mode stream)
(let* ((header (ser-decode-nat stream))
(len (if (eq version :v3)
(ash header -1)
header))
(normp (and (eq version :v3)
(= (the bit (logand header 1)) 1)
(not (eq hons-mode :never)))))
(unless (and (typep len 'fixnum)
(< (the fixnum len) array-dimension-limit))
; Sanity check: shouldn't normally happen if we encoded with ser-encode-str,
; though could perhaps happen if that was done with a different Lisp
; implementation.
(error "Trying to decode a string, but the length is too long."))
(let ((str (make-string (the fixnum len))))
(declare (type vector str))
(loop for i fixnum from 0 below (the fixnum len) do
(setf (schar str i) (ser-read-char stream)))
(if normp
(hons-copy str)
str))))
; -----------------------------------------------------------------------------
;
; ENCODING AND DECODING BASIC OBJECT LISTS
;
; -----------------------------------------------------------------------------
; We now build upon our encoders/decoders for individual elements, and write
; versions to deal with lists of naturals, rationals, etc.
(defstruct ser-decoder
; The decoder's state mainly consists of an ARRAY and a FREE index. As the file
; is decoded, ARRAY gets populated from zero on up, with FREE always pointing to
; the next available slot. Since array sizes are always fixnums, we know that
; FREE is always a fixnum.
(array (make-array 0) :type simple-vector)
(free 0 :type fixnum)
; The decoder also knows which file format we are decoding (i.e., :v1, :v2,
; :v3). This is set based on the magic number from the start of the file.
(version nil))
; NATS ::= LEN NAT* ; number of nats, followed by that many nats
(defun ser-encode-nats (x stream)
(let ((len (length x)))
(ser-print? "; Encoding ~a naturals.~%" len)
(ser-encode-nat len stream)
(dolist (elem x)
(ser-encode-nat elem stream))))
(defun ser-decode-and-load-nats (decoder stream)
(declare (type ser-decoder decoder))
(let* ((len (ser-decode-nat stream))
(arr (ser-decoder-array decoder))
(free (ser-decoder-free decoder))
(stop (+ free len)))
(declare (type fixnum free))
(ser-print? "; Decoding ~a naturals.~%" len)
(unless (<= stop (length arr))
;; Note that we need just one bounds check for the whole list of naturals.
(error "Invalid serialized object, too many naturals."))
(loop until (= (the fixnum stop) free) do
(setf (svref arr free) (ser-decode-nat stream))
(incf free))
(setf (ser-decoder-free decoder) stop)))
; RATS ::= LEN RAT* ; number of rats, followed by that many rats
(defun ser-encode-rats (x stream)
(let ((len (length x)))
(ser-print? "; Encoding ~a rationals.~%" len)
(ser-encode-nat len stream)
(dolist (elem x)
(ser-encode-rat elem stream))))
(defun ser-decode-and-load-rats (decoder stream)
(declare (type ser-decoder decoder))
(let* ((len (ser-decode-nat stream))
(arr (ser-decoder-array decoder))
(free (ser-decoder-free decoder))
(stop (+ free len)))
(declare (type fixnum free))
(ser-print? "; Decoding ~a rationals.~%" len)
(unless (<= stop (length arr))
(error "Invalid serialized object, too many rationals."))
(loop until (= (the fixnum stop) free) do
(setf (svref arr free) (ser-decode-rat stream))
(incf free))
(setf (ser-decoder-free decoder) stop)))
; COMPLEXES ::= LEN COMPLEX* ; number of complexes, followed by that many complexes
(defun ser-encode-complexes (x stream)
(let ((len (length x)))
(ser-print? "; Encoding ~a complexes.~%" len)
(ser-encode-nat len stream)
(dolist (elem x)
(ser-encode-complex elem stream))))
(defun ser-decode-and-load-complexes (decoder stream)
(declare (type ser-decoder decoder))
(let* ((len (ser-decode-nat stream))
(arr (ser-decoder-array decoder))
(free (ser-decoder-free decoder))
(stop (+ free len)))
(declare (type fixnum free))
(ser-print? "; Decoding ~a complexes.~%" len)
(unless (<= stop (length arr))
(error "Invalid serialized object, too many complexes."))
(loop until (= (the fixnum stop) free) do
(setf (svref arr free) (ser-decode-complex stream))
(incf free))
(setf (ser-decoder-free decoder) stop)))
; CHARS ::= LEN CHAR* ; number of characters, followed by that many chars
(defun ser-encode-chars (x stream)
(let ((len (length x)))
(ser-print? "; Encoding ~a characters.~%" len)
(ser-encode-nat len stream)
(dolist (elem x)
(ser-write-char elem stream))))
(defun ser-decode-and-load-chars (decoder stream)
(declare (type ser-decoder decoder))
(let* ((len (ser-decode-nat stream))
(arr (ser-decoder-array decoder))
(free (ser-decoder-free decoder))
(stop (+ free len)))
(declare (type fixnum free))
(ser-print? "; Decoding ~a characters.~%" len)
(unless (<= stop (length arr))
(error "Invalid serialized object, too many characters."))
(loop until (= (the fixnum stop) free) do
(setf (svref arr free) (ser-read-char stream))
(incf free))
(setf (ser-decoder-free decoder) stop)))
; STRS ::= LEN STR* ; number of strings, followed by that many strs
(defun ser-encode-strs (x stream)
(let ((len (length x)))
(ser-print? "; Encoding ~a strings.~%" len)
(ser-encode-nat len stream)
(dolist (elem x)
(ser-encode-str elem stream))))
(defun ser-decode-and-load-strs (hons-mode decoder stream)
(declare (type ser-decoder decoder))
(let* ((len (ser-decode-nat stream))
(arr (ser-decoder-array decoder))
(free (ser-decoder-free decoder))
(version (ser-decoder-version decoder))
(stop (+ free len)))
(declare (type fixnum free))
(ser-print? "; Decoding ~a strings.~%" len)
(unless (<= stop (length arr))
(error "Invalid serialized object, too many strings."))
(loop until (= (the fixnum stop) free) do
(setf (svref arr free) (ser-decode-str version hons-mode stream))
(incf free))
(setf (ser-decoder-free decoder) stop)))
; -----------------------------------------------------------------------------
;
; ENCODING AND DECODING SYMBOLS
;
; -----------------------------------------------------------------------------
; We don't want to pay the price of writing down the package for every symbol
; individually, since most of the time an object will probably contain lots of
; symbols from the same package. So, our basic approach is to organize the
; symbols into groups by their package names, and then for each package we write
; out: the name of the package, and the list of symbol names.
;
; See also the Essay on Bad Objects and Serialize. When we are decoding, we
; optionally check that packages are known to ACL2 by calling pkg-witness, which
; causes an error if it the package isn't known. Note that we only have to do
; this once per package, so this is a very low-cost check.
;
; If checking packages is so cheap, why not just check packages all the time?
; We tried that originally, but sometimes ACL2 actually DOES read in bad
; objects, e.g., foo@expansion.lsp may have *1* symbols in it, etc. So we need
; to avoid complaining if ACL2 is using the #Y or #Z reader when reading these
; files.
; PACKAGE ::= STR LEN STR* ; package name, number of symbols, symbol names
(defun ser-encode-package (pkg x stream)
(declare (type string pkg))
(let ((len (length x)))
(ser-print? "; Encoding ~a symbols for ~a package.~%" len pkg)
(ser-encode-str pkg stream)
(ser-encode-nat (length x) stream)
(dolist (elem x)
(ser-encode-str (symbol-name elem) stream))))
(defun ser-decode-and-load-package (check-packagesp decoder stream)
; We always use hons-mode :never below, because there's no need to norm the
; package or symbol names since we're going to intern them and not return them.
; The point here is that it is difficult to control norming of symbol-names;
; imagine for example reading FOO::X and then later (defconst *a* (intern
; (hons-copy "X") "FOO")). Then (symbol-name 'foo::x) will not be normed.
(declare (type ser-decoder decoder))
(let* ((version (ser-decoder-version decoder))
(arr (ser-decoder-array decoder))
(free (ser-decoder-free decoder))
(pkg-name (ser-decode-str version :never stream))
(len (ser-decode-nat stream))
(stop (+ free len)))
(declare (type fixnum free))
(ser-print? "; Decoding ~a symbols for ~a package.~%" len pkg-name)
(unless (<= stop (length arr))
(error "Invalid serialized object, too many symbols."))
(when check-packagesp
(pkg-witness pkg-name))
(loop until (= (the fixnum stop) free) do
(setf (svref arr free)
(let ((temp (ser-decode-str version :never stream)))
; We use *read-suppress* to avoid trying to handle symbols in contexts like
; #+sbcl (sb-ext:inhibit-warnings 3)
; where the feature is false (as in CCL for the example above).
(if *read-suppress* nil (intern temp pkg-name))))
(incf free))
(setf (ser-decoder-free decoder) stop)))
; PACKAGES ::= LEN PACKAGE* ; number of packages, followed by that many packages
(defun ser-encode-packages (alist stream)
; Alist maps package-names to the lists of their symbols.
(let ((len (length alist)))
(ser-print? "; Encoding symbols for ~a packages.~%" len)
(ser-encode-nat len stream)
(dolist (entry alist)
(ser-encode-package (car entry) (cdr entry) stream))))
(defun ser-decode-and-load-packages (check-packagesp decoder stream)
(declare (type ser-decoder decoder))
(let ((len (ser-decode-nat stream)))
(ser-print? "; Decoding symbols for ~a packages.~%" len)
(loop for i from 1 to len do
(ser-decode-and-load-package check-packagesp decoder stream))))
; -----------------------------------------------------------------------------
;
; PREPARING OBJECTS FOR ENCODING
;
; -----------------------------------------------------------------------------
(defun ser-hashtable-init (size test)
; For good performance, it is critical that we aggressively resize the hash
; tables that are used in the atom-gathering phase of encoding. This is just a
; wrapper for making hash tables with more aggressive rehash sizes.
(make-hash-table :size size
:test test
:rehash-size 2.2
#+ccl :shared #+ccl nil
))
(defstruct ser-encoder
; This object bundles the state of the encoder.
;
; The first phase of encoding is SER-GATHER-ATOMS. The goal is to quickly
; collect all of the atoms in the object, without duplication, and partition
; them into lists by their types.
;
; To avoid repeatedly collecting the same atoms, we use four "seen" tables that
; keep track of which objects we have explored. As we gather atoms, we mark the
; objects we have seen by binding them to T in the appropriate hash table.
;
; Every symbol we have seen is in the SYM hash table, and every
; number/character we have seen is in the EQL hash table. But the string and
; cons tables are only EQ hash tables. Because of this, EQUAL-but-not-EQ
; strings and conses may be bound in their seen tables.
;
; We make no effort to avoid "redundantly" writing EQUAL-but-not-EQ conses or
; strings. Of course, a HONS user could first hons-copy their object to
; achieve full structure sharing. This would perhaps improve read time at the
; cost of write time.
(seen-sym (ser-hashtable-init 1000 'eq) :type hash-table)
(seen-eql (ser-hashtable-init 1000 'eql) :type hash-table)
(seen-str (ser-hashtable-init 1000 'eq) :type hash-table)
(seen-cons (ser-hashtable-init 2000 'eq) :type hash-table)
; In addition to the above seen tables, the encoder has several accumulators
; which collect the atoms it finds during the GATHER-ATOMS phase. The basic
; idea here is to separate these objects by their types, so that we can then
; write them out using our encoders for lists of naturals, rationals, etc.
;
; The accumulators for naturals, rationals, complexes, strings, and characters
; are simple lists that we push new values into. Because of our seen-tables, we
; can guarantee that the accumulators for naturals, rationals, complexes, and
; characters have no duplicates. However, the strings accumulator may contain
; duplicates in the sense that two members might be equal but not eq.
(naturals nil :type list)
(rationals nil :type list)
(complexes nil :type list)
(chars nil :type list)
(strings nil :type list)
; The symbol accumulator is more complex. The SYMBOL-HT is a hash table that
; associates packages with the lists of symbols found in that package. Once we
; are done gathering atoms, we map over this hash table to convert it into an
; alist (SYMBOL-AL). This conversion is cheap; it only requires one cons per
; package.
(symbol-ht (ser-hashtable-init 60 'eq) :type hash-table)
(symbol-al nil :type list)
; The free-index here is only used in ser-encode-conses. Bundling it with the
; encoder's state is beneficial in two ways for ser-encode-conses: it reduces
; stack size requirements by eliminating a parameter, and simplifies the flow
; because we don't need to return multiple values.
(free-index 0 :type fixnum)
; The stream that we are writing into. Bundling this into the encoder instead
; of passing it as an extra argument helps to reduce the stack size
; requirements for ser-encode-conses.
(stream nil)
)
(defmacro ser-see-obj (x table)
; Mark X as seen, and return T/NIL based on whether it was previously seen.
`(let ((x ,x)
(tbl ,table))
(cond ((gethash x tbl) t)
(t (setf (gethash x tbl) t)
nil))))
(defun ser-gather-atoms (x encoder)
; Gathering atoms is particularly performance critical, so we have looked into
; making it faster. We assume X is a valid ACL2 object. We do some typep
; checks in a few cases where using ordinary recognizers seems to be slower.
; But this does not gain us much, because almost all of the time seems to be
; spent on hashing.
;
; Sol Swords uses a destructive hashing scheme in his AIGER writer which we
; could probably adapt for use here, and it would probably lead to significant
; performance gains. However, anything destructive is scary with respect to
; multithreaded code, and we don't want to use it unless we really have no
; other choice.
(declare (type ser-encoder encoder))
(cond ((consp x)
(unless (ser-see-obj x (ser-encoder-seen-cons encoder))
(ser-gather-atoms (car x) encoder)
(ser-gather-atoms (cdr x) encoder)))
((symbolp x)
;; V2 change: do not gather T and NIL into the accumulator for
;; symbols. They are implicit in the v2 format.
(unless (or (eq x t)
(eq x nil)
(ser-see-obj x (ser-encoder-seen-sym encoder)))
(push x (gethash (symbol-package x)
(ser-encoder-symbol-ht encoder)))))
((typep x 'fixnum)
;; This case is probably common enough to add explicitly, since this
;; fixnum check is so fast.
(unless (ser-see-obj x (ser-encoder-seen-eql encoder))
(if (< (the fixnum x) 0)
(push x (ser-encoder-rationals encoder))
(push x (ser-encoder-naturals encoder)))))
((typep x 'array) ; <-- (stringp x), but twice as fast in CCL.
(unless (ser-see-obj x (ser-encoder-seen-str encoder))
(push x (ser-encoder-strings encoder))))
;; Performance is probably already pretty bad at this point, because of
;; all the typep checks above.
(t
(unless (ser-see-obj x (ser-encoder-seen-eql encoder))
(cond ((typep x 'character)
(push x (ser-encoder-chars encoder)))
((typep x 'integer)
(if (< x 0)
(push x (ser-encoder-rationals encoder))
(push x (ser-encoder-naturals encoder))))
((rationalp x)
(push x (ser-encoder-rationals encoder)))
((complex-rationalp x)
(push x (ser-encoder-complexes encoder)))
(t
(error "ser-gather-atoms-types given non-ACL2 object.")))))))
; Essay on How We Handle Equal-but-not-eq Strings
;
; A subtle change in V3 is that we no longer make any effort to avoid
; redundantly encoding EQUAL-but-not-EQ or strings.
;
; When I first developed serialize, I wanted to use it to save the models of
; our processor. My Verilog translator produced objects with lots of strings,
; and in many cases these strings could be different, e.g., if you parsed a
; module with:
;
; module m ( ..., foo, ... );
; input foo;
; assign ... = foo;
; ...
; endmodule
;
; then you could end up with lots of different strings that all said "foo".
; Note also that in this context, I really wanted to optimize for read time
; instead of write time (a script made the processor models at night, where
; time is irrelevant, but I needed to read them in while developing proofs.)
; At any rate, for these reasons, I really wanted to avoid writing out
; redundant strings.
;
; My first idea was to just use an EQUAL hash table for seen-str, but this
; turned out to be far too slow.
;
; Instead, I developed a fancy scheme. First, the seen-str was only an EQ hash
; table. But when encoding the collected strings, I (1) sorted them using
; Lisp's SORT function, which is a stable sort, and (2) assigned their indices
; using a tricky function that looked for adjacent EQUAL strings, and reused
; the index in this case. The net effect was that all strings would be
; canonicalized to some representative. This was probably a performance win
; because, instead of having to EQUAL-hash at each occurrence of the string, we
; only have to EQ hash and then do one global sort at the end. (It probably
; would have worked just as well to use separate EQ and EQUAL hash tables.)
;
; This scheme was used in V1 and V2. However, in V3, where we started to
; record whether strings were normed, and restore them to their normed status,
; we ran into a problem.
;
; Suppose there are two EQUAL-but-not-EQ strings,
; - X (which is normed) and
; - Y (which is not).
;
; Under our canonicalization scheme, we would choose one of these as the
; canonical form "at random." (Actually the winner just depended on which we
; first encountered, but you can think of that as basically random.) At any
; rate, this could two bad outcomes:
;
; (1) Y could be canonicalized to X, and hence become normed. This seems
; basically fine and is probably nothing to be worried about unless some
; TTAG code is depending on it being different from Y, which wouldn't be
; sound anyway).
;
; (2) X could be canonicalized to Y, and hence lose its normed status. This
; is bad because if X is being used as a fast alist key, we will have
; trouble restoring that alist.
;
; It seems easy enough to fix this by norming the string whenever ANY of its
; EQUAL-but-not-EQ partners are normed. But instead, we now just do not try to
; avoid writing out redundant strings. Why not?
;
; - It was somewhat complicated and ugly, and the fix seems kind of gross.
;
; - Sorting the strings slows down writing, and write speed is now more
; important than it used to be. We used to only write out our processor
; models with an automated script. But now serialize is used to print
; certificates, etc., and while write speed is certainly still less
; important than read speed, it is at least somewhat important.
;
; - EQUAL-but-not-EQ strings don't seem that prevalent anyway, e.g., we now
; judiciously use HONS-COPY to normalize strings in the Verilog parser,
; etc., and other users can do the same if they think EQUAL-but-not-EQ
; copies of a string are likely to occur.
(defun ser-make-atom-map (encoder)
; After the atoms have been gathered we want to assign them unique indices.
; These indices will need to agree with the implicit order of the indices in
; the serialized file. So, we need to assign indices to the naturals first,
; then the rationals, etc.
;
; In earlier versions of serialize, we constructed "atom map" structures that
; were hash tables binding atoms to their indices. These atom maps were new
; structures that were unrelated to the seen-tables above.
;
; But now, for considerably better performance and memory efficiency, we
; instead smash the seen-tables and convert them into index mappings. That is,
; during SER-GATHER-ATOMS above, the seen tables just bound the objects we had
; seen to T. Now we are going to smash these bindings and replace them with
; their indices. This is especially efficient because their hash tables have
; already been grown to the proper sizes.
;
; Before we this smashing process, we check that the maximum index we will ever
; need is going to be a fixnum. Because of this, throughout this code we can
; assume that all indices are always fixnums.
;
; Future optimization potential. It might be possible to do the index
; assignment inline with atom gathering, by keeping separate track of how many
; naturals we have seen, how many characters, etc., and storing only
; type-relative offsets into the atom maps instead of absolute indices. This
; might be worthwhile: it should significantly reduce the amount of hashing we
; need to do.
;; Note: this order must agree with ser-encode-atoms.
(let ((free-index 2)
;; In v2, the first free index is 2 (nil and t are implicitly 0 and 1)
(seen-sym (ser-encoder-seen-sym encoder))
(seen-eql (ser-encoder-seen-eql encoder))
(seen-str (ser-encoder-seen-str encoder)))
(declare (type fixnum free-index)
(type hash-table seen-sym seen-eql seen-str))
(dolist (elem (ser-encoder-naturals encoder))
(setf (gethash elem seen-eql) free-index)
(incf free-index))
(dolist (elem (ser-encoder-rationals encoder))
(setf (gethash elem seen-eql) free-index)
(incf free-index))
(dolist (elem (ser-encoder-complexes encoder))
(setf (gethash elem seen-eql) free-index)
(incf free-index))
(dolist (elem (ser-encoder-chars encoder))
(setf (gethash elem seen-eql) free-index)
(incf free-index))
(dolist (elem (ser-encoder-strings encoder))
(setf (gethash elem seen-str) free-index)
(incf free-index))
;; Turn the hash table of symbols into an alist so that they're in the same
;; order now and when we encode. This might not be necessary, but it's
;; probably very cheap in practice because there's only one entry per
;; package.
(let ((al nil))
(maphash (lambda (key val)
(push (cons (package-name key) val) al))
(ser-encoder-symbol-ht encoder))
(setf (ser-encoder-symbol-al encoder) al))
(dolist (elem (ser-encoder-symbol-al encoder))
(dolist (sym (cdr elem))
;; We don't have to check for T and NIL because we didn't accumulate
;; them into the symbol table.
(setf (gethash sym seen-sym) free-index)
(incf free-index)))
;; V2 change: explicitly assign nil and t indices 0 and 1
(setf (gethash nil seen-sym) 0)
(setf (gethash t seen-sym) 1)
;; Finally, update the encoder with the free index we've arrived at.
(setf (ser-encoder-free-index encoder) free-index)))
; -----------------------------------------------------------------------------
;
; ENCODING AND DECODING CONSES
;
; -----------------------------------------------------------------------------
; After the atoms have been assigned their indices as above, we are going to
; write out a list of instructions for reassembling the conses in the object.
; We keep incrementing the free-index as we go so that the atoms and conses end
; up in a shared index-space. Just as we smashed the seen-tables for the
; atoms, we also smash the seen-cons table to reuse its space for the indices.
;
; In earlier versions of serialize, we separated the act of generating
; instructions from writing them. But now for greater efficiency we fuse the
; two operations so that we never need to record the instructions anywhere
; except in the stream.
;
; We call ser-encode-conses only after generating all of the atom maps,
; writing out all the atoms in the file, and writing the number of conses that
; we are about to build (which is available as the count of the seen-cons
; table.)
(defun ser-encode-conses
(x ; the object we are encoding, which we are recurring through
encoder ; the encoder's state (so we can look at all the tables)
)
"Returns X-INDEX"
(declare (type ser-encoder encoder))
(if (atom x)
;; Atoms already have their indices assigned, so there's nothing
;; to do but look them up.
(cond ((symbolp x)
(gethash x (ser-encoder-seen-sym encoder)))
((stringp x)
(gethash x (ser-encoder-seen-str encoder)))
(t
(gethash x (ser-encoder-seen-eql encoder))))
(let* ((seen-cons (ser-encoder-seen-cons encoder))
(idx (gethash x seen-cons)))
;; At this point you might expect to see something like, "(if idx ...)".
;; But since we are reusing the seen-cons table, every cons that does not
;; already have its index assigned is bound to T, not unbound. To see if
;; an index has been assigned, then, we have to check if it is a number.
;; Since all indices are fixnums, I check whether it's a fixnum, which is
;; very fast (just looking at type bits), at least on CCL.
(if (typep idx 'fixnum)
idx
(let* ((car-index (ser-encode-conses (car x) encoder))
(cdr-index (ser-encode-conses (cdr x) encoder))
;; At this point, we've assigned indices to the car and cdr.
;; We've also written out all of the instructions needed to
;; generate them in the stream. We can now assign an index to X
;; and write the instruction for rebuilding it:
(free-index (ser-encoder-free-index encoder))
(stream (ser-encoder-stream encoder))
;; V2 change: we now write (car-index << 1) | (if honsp 1 0)
;; instead of just car-index. Note that these fixnum
;; declarations are justified by the checking we do in
;; ser-encode-to-stream.
(v2-car-index
(if (hl-hspace-honsp-wrapper x)
(the fixnum (logior (the fixnum (ash car-index 1)) 1))
(the fixnum (ash car-index 1)))))
(declare (type fixnum car-index cdr-index v2-car-index free-index))
(setf (gethash x seen-cons) free-index)
(ser-encode-nat v2-car-index stream)
(ser-encode-nat cdr-index stream)
(setf (ser-encoder-free-index encoder) (the fixnum (+ 1 free-index)))
free-index)))))
(defmacro ser-decode-loop (version hons-mode)
`(loop until (= (the fixnum stop) free) do
(let ((first-index (ser-decode-nat stream)))
(unless (typep first-index 'fixnum)
(error "Consing instruction has non-fixnum first-index."))
(let ((car-index ,(if (eq version :v1)
'first-index
'(the fixnum (ash (the fixnum first-index) -1))))
(honsp ,(cond ((eq hons-mode :always)
't)
((and (eq hons-mode :smart)
(not (eq version :v1)))
'(logbitp 0 (the fixnum first-index)))
(t
nil)))
(cdr-index (ser-decode-nat stream)))
;; Performance testing suggests these bounds checks are
;; almost free.
(unless (and (typep cdr-index 'fixnum)
(< (the fixnum car-index) free)
(< (the fixnum cdr-index) free))
(error "Consing instruction has index out of bounds."))
(let ((car-obj (svref arr (the fixnum car-index)))
(cdr-obj (svref arr (the fixnum cdr-index))))
(setf (svref arr free)
(if honsp
(hons car-obj cdr-obj)
(cons car-obj cdr-obj)))
(incf free))))))
(defun ser-decode-and-load-conses (hons-mode decoder stream)
;; The valid hons modes are:
;; :always - always hons regardless of hons bits
;; :never - never hons regardless of hons bits
;; :smart - hons only when hons bits are set (v2/3 only)
;; smart does no honsing for v1 files
(declare (type ser-decoder decoder))
(let* ((len (ser-decode-nat stream))
(arr (ser-decoder-array decoder))
(free (ser-decoder-free decoder))
(version (ser-decoder-version decoder))
(stop (+ free len)))
(declare (type fixnum free))
(ser-print? "; Decoding ~a consing instructions.~%" len)
(unless (<= stop (length arr))
;; Like our other decoders, we only need a single bounds check to make
;; sure we won't overflow the array as we decode the conses.
(error "Invalid serialized object, too many conses."))
;; This is a gross hack so that we have five different loops, optimized for
;; the different cases of hons-mode and file version.
(if (eq version :v1)
(if (eq hons-mode :always)
(ser-decode-loop :v1 :always)
(ser-decode-loop :v1 :never))
;; v2/3 are the same here
(cond ((eq hons-mode :always)
(ser-decode-loop :v2 :always))
((eq hons-mode :never)
(ser-decode-loop :v2 :never))
(t
(ser-decode-loop :v2 :smart))))
(setf (ser-decoder-free decoder) stop)))
; -----------------------------------------------------------------------------
;
; FAST ALISTS
;
; -----------------------------------------------------------------------------
; See also the Essay on Fast Alists in hons-raw.lisp. The FAL-HT binds some
; alists to hash tables. Some of these alists might be conses that we've just
; written out. For each such alist, we just want to record its index.
;
; Our basic approach to restoring fast alists is as follows. First, we encode
; the whole object -- we gather its atoms, assign indices to them, and write
; out all the consing instructions -- without any regard to which alists inside
; of it are fast. Then we tack on some fast-alist information.
;
; Once the whole object has been encoded, we look at the FALTABLE from the Hons
; Space. Recall that the FALTABLE binds alists to their backing hash tables.
; For each entry in the FALTABLE, we check whether the alist has been assigned
; an index in the encoder's SEEN-CONS table. I.e., we check whether the alist
; was part of the object we just encoded. If so, we write down a FAL
; instruction that describes how to rebuild the hash table.
;
; This seems pretty efficient: each FAL instruction is only two natural
; numbers, so the real added cost to encoding is just N hash table lookups,
; where N is the size of the FALTABLE, and the cost of encoding 2M naturals
; where M is the number of fast alists actually in the object.
;
; An encoded FAL instruction is a pair of natural numbers, INDEX and COUNT.
; The INDEX is just the looked up index of the alist, itself. The COUNT is the
; hash-table-count of the backing hash table, so that when we recreate the hash
; table we can initialize it with approximately the correct size.
;
; So how does decoding work? Since the FAL instructions come at the end of the
; object, we already have restored the whole object by the time we get to them.
; In other words, the INDEX that we read tells us where, in the decode array,
; we can find the alist that was fast. Assuming we are using smart honsing or
; always honsing, then the keys of the ALIST should already be honsed, because
; it was a fast alist and so its keys had to be honses. So all we need to do
; is rebuild the hash table for this alist and install it into the FAL table,
; which is very easy.
;
; We zero-terminate the FALS instead of writing down the number of FALS first.
; This is legitimate because 0 is always the index of NIL, and NIL cannot be
; bound in the FALTABLE, so there is no ambiguity. It is desirable because as
; we encode, we do not know how many FAL instructions we will actually need to
; write out. Likewise, as we decode we do not need to know how many FAL
; instructions will be processed.
(defun ser-encode-fals (encoder)
(declare (type ser-encoder encoder))
; Q: Why include the hash table's count, when the decoder could just take the
; length of the alist instead?
;
; A: Since the alist can have shadowed pairs, so its length may not be a very
; good indication of the size we should use. We could end up with a much
; larger hash table than we really need.
;
; Q: Why use the hash-table-count instead of the hash-table-size?
;
; A: If a fast alist has been saved in a book, we think it is relatively
; unlikely that it is going to be modified by a sub-book. It seems more likely
; that it is some kind of lookup table that you intend to refer to over and
; over.
(let* ((stream (ser-encoder-stream encoder))
(seen-cons (ser-encoder-seen-cons encoder))
(fn
(lambda (alist backing-hash-table)
(let ((idx (gethash alist seen-cons)))
(when idx
(ser-encode-nat idx stream)
(ser-encode-nat (hash-table-count backing-hash-table)
stream))))))
(hl-faltable-maphash fn (hl-hspace-faltable-wrapper))
(ser-encode-nat 0 stream)))
(defun ser-decode-and-restore-fals (decoder hons-mode stream)
(declare (type ser-decoder decoder))
; Q: Why don't we restore when hons-mode is :never?
;
; A: The keys of a fast alist need to be honsed. If we didn't smartly (or
; dumbly) make them honses when we built the conses, then we may not be able to
; convert the alist into a fast alist. Sure, we could build a new alist that
; is EQUAL to the alist and make it fast, but then how would we install it? It
; wouldn't work to smash the entry in the decoder array -- the other conses
; that rely on it have already been built. The only way would be to walk
; through the object and replace all uses of the alist with the new alist, and
; that seems horribly slow. At any rate, it doesn't seem unreasonable to say,
; if you're reading the file with no honses, you get no fast alists either.
(let* ((array (ser-decoder-array decoder))
(max (length array))
(index (ser-decode-nat stream)))
(loop until (= index 0) do
(unless (< index max)
(error "FAL index to restore is too large!"))
(unless (eq hons-mode :never)
(let ((alist (svref array index))
(count (ser-decode-nat stream)))
(hl-restore-fal-for-serialize alist count)))
; Notice that we make progress reading nats from stream even if hons-mode is
; :never; we just ignore those nats!
(setq index (ser-decode-nat stream)))))
(defun ser-encode-atoms (encoder)
(declare (type ser-encoder encoder))
; It's sort of silly for this to be its own function, but it makes a convenient
; target for timing.
(let ((stream (ser-encoder-stream encoder)))
(ser-encode-nats (ser-encoder-naturals encoder) stream)
(ser-encode-rats (ser-encoder-rationals encoder) stream)
(ser-encode-complexes (ser-encoder-complexes encoder) stream)
(ser-encode-chars (ser-encoder-chars encoder) stream)
(ser-encode-strs (ser-encoder-strings encoder) stream)
(ser-encode-packages (ser-encoder-symbol-al encoder) stream)))
(defun ser-encode-to-stream (obj stream)
; Serialize the OBJ and write it to the stream. This writes "everything from
; magic number to magic number." Note that it does NOT include the #Z prefix,
; which is needed if you're going to read the object back in.
(let ((encoder (make-ser-encoder :stream stream))
starting-free-index-for-conses
total-number-of-objects
max-index
nconses)
(declare (dynamic-extent encoder))
;; Make sure the hons space is initialized
(hl-maybe-initialize-default-hs-wrapper)
(ser-time? (ser-gather-atoms obj encoder))
(setq nconses (hash-table-count (ser-encoder-seen-cons encoder)))
(unless (typep (ash (+ 2 ;; to account for T and NIL
(hash-table-count (ser-encoder-seen-sym encoder))
(hash-table-count (ser-encoder-seen-eql encoder))
(hash-table-count (ser-encoder-seen-str encoder))
nconses)
1)
'fixnum)
;; This check ensures that all indices will be fixnums. The sum above
;; may actually exceed the actual maximum index we will have, because
;; EQUAL-but-not-EQ strings will be removed. But it is at least as large
;; as the maximum index, so if it is a fixnum then all indices are
;; fixnums. In V1 we just checked if the sum was a fixnum. In V2 we
;; need to shift it since indices get shifted in the file. Note that we
;; rely on this check in ser-encode-conses.
(error "Maximum index exceeded."))
(ser-time? (ser-make-atom-map encoder))
(setq starting-free-index-for-conses (ser-encoder-free-index encoder))
(ser-encode-magic stream)
(setq total-number-of-objects
(the fixnum (+ starting-free-index-for-conses nconses)))
(ser-encode-nat (cond ((eq obj nil)
0)
(t
(- total-number-of-objects 1)))
stream)
(ser-time? (ser-encode-atoms encoder))
(ser-encode-nat nconses stream)
(setq max-index (ser-time? (ser-encode-conses obj encoder)))
(ser-time? (ser-encode-fals encoder))
(unless (and (equal (ser-encoder-free-index encoder)
total-number-of-objects)
(or (equal max-index (- (ser-encoder-free-index encoder) 1))
;; in v2, max-index can be 0 and 1 also, for nil or t.
;; if it happens to be t, then it's still going to be one
;; less than the max free index.
(equal max-index 0)))
(error "Sanity check failed in ser-encode-to-stream!~% ~
- final-free-index is ~a~% ~
- total-number-of-objects is ~a~% ~
- max-index is ~a~%"
(ser-encoder-free-index encoder)
total-number-of-objects
max-index))
(ser-encode-magic stream)))
(defun ser-decode-and-load-atoms (check-packagesp hons-mode decoder stream)
(declare (type ser-decoder decoder))
(ser-decode-and-load-nats decoder stream)
(ser-decode-and-load-rats decoder stream)
(ser-decode-and-load-complexes decoder stream)
(ser-decode-and-load-chars decoder stream)
(ser-decode-and-load-strs hons-mode decoder stream)
(ser-decode-and-load-packages check-packagesp decoder stream))
(defun ser-make-array-wrapper (size)
; Sol Swords reported in July 2017 that with SBCL, he has provided the
; following safety-3 wrapper to avoid some crashes in ser-decode-from-stream.
#+sbcl (declare (optimize (safety 3)))
(make-array size))
(defun ser-decode-from-stream (check-packagesp hons-mode stream)
; Warning: If you change the input or output signature of this function, change
; its (declaim (ftype ...)) form in acl2-fns.lisp.
; Read a serialized object from the stream. This reads "everything from magic
; number to magic number." Note that it does NOT expect there to be a #Z
; prefix.
(let* ((version (ser-decode-magic stream))
(size/idx (ser-decode-nat stream))
(arr-size (if (or (eq version :v2)
(eq version :v3))
(cond ((eq size/idx 0)
2)
(t
(+ size/idx 1)))
size/idx))
(final-obj (if (or (eq version :v2)
(eq version :v3))
size/idx
(- arr-size 1))))
(unless (typep arr-size 'fixnum)
(error "Serialized object is too large."))
(let* ((arr (ser-make-array-wrapper arr-size))
(decoder (make-ser-decoder :array arr
:free 0
:version version)))
(declare (dynamic-extent arr decoder)
(type ser-decoder decoder))
(when (or (eq version :v2)
(eq version :v3))
(setf (aref arr 0) nil)
(setf (aref arr 1) t)
(setf (ser-decoder-free decoder) 2))
(ser-print? "; Decoding serialized object of size ~a.~%" arr-size)
(ser-time? (ser-decode-and-load-atoms check-packagesp hons-mode decoder
stream))
(ser-time? (ser-decode-and-load-conses hons-mode decoder stream))
(when (eq version :v3)
(ser-decode-and-restore-fals decoder hons-mode stream))
(unless (eq (ser-decode-magic stream) version)
(error "Invalid serialized object, magic number mismatch."))
(unless (= (ser-decoder-free decoder) arr-size)
(error "Invalid serialized object.~% ~
- Decode-free is ~a~%
- Arr-size is ~a."
(ser-decoder-free decoder) arr-size))
;; Return the top object.
(svref arr final-obj))))
|