aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexis Lockwood2021-07-02 21:41:42 -0400
committerAlexis Lockwood2021-07-02 21:48:20 -0400
commit63b20eb0b660c990ec6cc1e77b215419582f18ac (patch)
treea91e10e2a488846a2ccd62609791a0b82bd9280a
parentbcf6ea4ca2d2ca6abc15429de52410f1985d7d61 (diff)
ls_minify: reversible ident minification
-rw-r--r--README.md9
-rw-r--r--ls_minify.c89
-rw-r--r--ls_minify_identgen.c77
-rw-r--r--ls_minify_identgen.h22
4 files changed, 157 insertions, 40 deletions
diff --git a/README.md b/README.md
index 6b0c97c..5cf54a4 100644
--- a/README.md
+++ b/README.md
@@ -114,12 +114,13 @@ The following things use pool entries:
the number of possible minified six-letter idents is enormous (about 52
billion if I did the math right) so if you're always going to minify idents
you might as well get full-length in the pre-min code.
-- It'd also be cool if `ls_minify` could save a table of minified idents for
- later unminifcation. It could even add them to the source as special comments
- as an in-between (they'd only be mentioned once each)
-- BUG? Something weird is happening when running minify_ident_test. `ls_minify`
+- BUG? Something weird is happening when running `minify_ident_test`. `ls_minify`
locks up somewhere suspiciously near `pc = UINT16_MAX / 2`. That file is
way bigger than the max though so I'll need to make a better test case.
+- Can we make the minifier accept code longer than the max? I think it'd have
+ to have its own build of the lexer... We could just make `ls_addr_t` into
+ `size_t`, that wouldn't have any effect on 8-bit anyway, though it'd make
+ the pool more wasteful on 32-bit.
# THE SCRIPTING LANGUAGE
diff --git a/ls_minify.c b/ls_minify.c
index 552a29e..c6abeae 100644
--- a/ls_minify.c
+++ b/ls_minify.c
@@ -31,7 +31,7 @@
// --- PRIVATE DATATYPES -------------------------------------------------------
typedef struct {
- char const * s;
+ uint8_t const * s;
size_t len;
} file_fetcher_t;
@@ -40,6 +40,7 @@ typedef struct {
ls_token_t last_tok;
file_fetcher_t * fet;
bool add_space;
+ bool eol_at_end;
FILE * f_out;
} minifier_t;
@@ -48,7 +49,7 @@ typedef struct {
static int _fetcher(void * arg, uint16_t loc);
static void _usage(char const * argv0, bool short_text);
-static void _minify(file_fetcher_t * fet, FILE * f_out);
+static void _minify(minifier_t * min, FILE * f_out);
static void _min_word_or_str_label(minifier_t * min, ls_token_t tok);
static void _min_number_or_num_label(minifier_t * min, ls_token_t tok);
@@ -63,6 +64,8 @@ static void _min_sep(minifier_t * min, ls_token_t tok);
static bool keep_rems;
static bool min_idents;
+static bool emit_ident_table;
+static bool have_ident_table;
static bool un_minify;
// --- PUBLIC FUNCTIONS --------------------------------------------------------
@@ -70,7 +73,7 @@ static bool un_minify;
int main(int argc, char ** argv)
{
int opt;
- while ((opt = getopt(argc, argv, "hiru")) != -1)
+ while ((opt = getopt(argc, argv, "hiIru")) != -1)
{
switch (opt)
{
@@ -79,6 +82,9 @@ int main(int argc, char ** argv)
exit(opt == 'h' ? EXIT_SUCCESS : EXIT_FAILURE);
break;
+ case 'I':
+ emit_ident_table = true;
+ // fall through
case 'i':
min_idents = true;
break;
@@ -103,7 +109,7 @@ int main(int argc, char ** argv)
f = explain_fopen_or_die(argv[optind], "r");
size_t sz = 512, i = 0;
- char * s = explain_malloc_or_die(sz);
+ uint8_t * s = explain_malloc_or_die(sz);
while (!feof(f))
{
@@ -137,7 +143,20 @@ int main(int argc, char ** argv)
else
f = stdout;
- _minify(&fet, f);
+ minifier_t min = {.fet = &fet};
+
+ if (un_minify)
+ have_ident_table = ls_minify_load_mit(fet.s, &fet.len);
+
+ _minify(&min, f);
+
+ if (emit_ident_table)
+ {
+ if (!min.eol_at_end)
+ fprintf(f, "\n");
+ ls_minify_emit_mit(f);
+ }
+
free(s);
if (f != stdout)
@@ -171,44 +190,45 @@ static void _usage(char const * argv0, bool short_text)
fprintf(stderr, "\n");
fprintf(stderr, " -i minify identifiers, making the code much\n");
fprintf(stderr, " less readable after un-minification\n");
+ fprintf(stderr, " -I -i, but also emit an idents table to allow\n");
+ fprintf(stderr, " full un-minification\n");
fprintf(stderr, " -r keep REM comments\n");
fprintf(stderr, " -u un-minify\n");
}
-static void _minify(file_fetcher_t * fet, FILE * f_out)
+static void _minify(minifier_t * min, FILE * f_out)
{
- minifier_t min = {.fet = fet};
ls_value_t pool[100]; // for labels
- ls_init(&min.ls, pool, sizeof pool / sizeof pool[0]);
- min.ls.fetcher = _fetcher;
- min.ls.fetcher_arg = (void *) fet;
+ ls_init(&min->ls, pool, sizeof pool / sizeof pool[0]);
+ min->ls.fetcher = _fetcher;
+ min->ls.fetcher_arg = (void *) min->fet;
- if (setjmp(min.ls._error_jmp_buf))
+ if (setjmp(min->ls._error_jmp_buf))
{
uint16_t line = 0, col = 0;
- ls_translate_pc(&min.ls, min.ls._pc, &line, &col);
+ ls_translate_pc(&min->ls, min->ls._pc, &line, &col);
fprintf(stderr, "error %d at %u:%u",
- (int) min.ls._error, line, col);
+ (int) min->ls._error, line, col);
exit(EXIT_FAILURE);
}
- min.f_out = f_out;
- min.last_tok = LS_TOK_NONE;
+ min->f_out = f_out;
+ min->last_tok = LS_TOK_NONE;
// TODO: actually collect idents and label numbers, and minify them
// too
bool done = false;
while (!done)
{
- ls_token_t tok = ls_lex(&min.ls);
+ ls_token_t tok = ls_lex(&min->ls);
- min.add_space =
- min.last_tok == LS_TOK_NUMBER ||
- min.last_tok == LS_TOK_WORD ||
- (LS_TOK_KEYWORD(min.last_tok) && un_minify) ||
- (LS_TOK_OPER(min.last_tok) && un_minify) ||
- (min.last_tok == LS_TOK_COMMA && un_minify);
+ min->add_space =
+ min->last_tok == LS_TOK_NUMBER ||
+ min->last_tok == LS_TOK_WORD ||
+ (LS_TOK_KEYWORD(min->last_tok) && un_minify) ||
+ (LS_TOK_OPER(min->last_tok) && un_minify) ||
+ (min->last_tok == LS_TOK_COMMA && un_minify);
ls_token_t tok_for_last_tok = tok;
@@ -216,45 +236,48 @@ static void _minify(file_fetcher_t * fet, FILE * f_out)
{
case LS_TOK_NUMBER:
case LS_TOK_NUM_LABEL:
- _min_number_or_num_label(&min, tok);
+ _min_number_or_num_label(min, tok);
break;
case LS_TOK_WORD:
case LS_TOK_STR_LABEL:
- _min_word_or_str_label(&min, tok);
+ _min_word_or_str_label(min, tok);
break;
case LS_TOK_STRING:
- _min_string(&min, tok);
+ _min_string(min, tok);
break;
case LS_TOK_COMMA:
- _min_comma(&min, tok);
+ _min_comma(min, tok);
break;
case LS_TOK_STATEMENT_SEP:
- _min_sep(&min, tok);
+ min->eol_at_end = true;
+ _min_sep(min, tok);
break;
case LS_TOK_INVALID:
- ls_throw_err(&min.ls, LS_SYNTAX_ERROR);
+ ls_throw_err(&min->ls, LS_SYNTAX_ERROR);
break;
default:
if (LS_TOK_KEYWORD(tok))
- tok_for_last_tok = _min_keyword(&min, tok);
+ tok_for_last_tok = _min_keyword(min, tok);
else if (LS_TOK_OPER(tok))
- _min_operator(&min, tok);
+ _min_operator(min, tok);
else
- ls_throw_err(&min.ls, LS_INTERNAL_ERROR);
+ ls_throw_err(&min->ls, LS_INTERNAL_ERROR);
break;
case LS_TOK_NONE:
done = true;
+ min->eol_at_end =
+ (min->last_tok == LS_TOK_STATEMENT_SEP);
break;
}
- min.last_tok = tok_for_last_tok;
+ min->last_tok = tok_for_last_tok;
}
}
@@ -267,7 +290,7 @@ static void _min_word_or_str_label(minifier_t * min, ls_token_t tok)
min->ls._token.word[LS_IDENT_OR_KW_LEN] = 0;
char const * ident;
- if (min_idents)
+ if (min_idents || have_ident_table)
ident = ls_minify_identgen(min->ls._token.word);
else
ident = min->ls._token.word;
diff --git a/ls_minify_identgen.c b/ls_minify_identgen.c
index a516219..bca0fa4 100644
--- a/ls_minify_identgen.c
+++ b/ls_minify_identgen.c
@@ -3,6 +3,8 @@
// --- DEPENDENCIES ------------------------------------------------------------
+#define _GNU_SOURCE // memmem
+
// This module
#include "ls_minify_identgen.h"
@@ -12,7 +14,7 @@
// External dependencies
#include <uthash.h>
-#include <libexplain/malloc.h>
+#include <libexplain/calloc.h>
// Standard headers
#include <stdbool.h>
@@ -36,6 +38,12 @@ static const char _identchars[] =
#define RADIX_INITIAL 53 // a-z A-Z _
#define RADIX_REST 63 // a-z A-Z _ 0-9
+static const uint8_t magic[4] = {
+ LS_KW_REM, ((uint8_t) 'M') | 0x80,
+ ((uint8_t) 'I') | 0x80,
+ ((uint8_t) 'T') | 0x80
+};
+
// --- PRIVATE FUNCTION PROTOTYPES ---------------------------------------------
/// Get the next unique identifier.
@@ -48,6 +56,7 @@ static bool _get_next_ident(char * ident);
// --- PRIVATE VARIABLES -------------------------------------------------------
static ident_t * _idents = NULL;
+static uint32_t _last_ident = 0;
// --- PUBLIC FUNCTIONS --------------------------------------------------------
@@ -65,7 +74,7 @@ char const * ls_minify_identgen(char const * ident)
return s->short_ident;
}
- s = explain_malloc_or_die(sizeof(ident_t));
+ s = explain_calloc_or_die(1, sizeof(ident_t));
strncpy(s->long_ident, ident, LS_IDENT_LEN);
s->long_ident[LS_IDENT_LEN] = 0;
bool success = _get_next_ident(s->short_ident);
@@ -79,6 +88,69 @@ char const * ls_minify_identgen(char const * ident)
return s->short_ident;
}
+void ls_minify_emit_mit(FILE * f)
+{
+    // The table opens with the magic byte sequence so that
+    // ls_minify_load_mit() can locate it later.
+    fwrite(magic, 1, sizeof(magic), f);
+
+    // Walk the hash table through its hh.next links. Per the original
+    // author's note the entries are already in the order required.
+    ident_t * entry = _idents;
+    while (entry)
+    {
+        // Only the used part of the name is written, capped at
+        // LS_IDENT_LEN bytes.
+        size_t const n = strnlen(entry->long_ident, LS_IDENT_LEN);
+        uint8_t buf[LS_IDENT_LEN + 1] = {0};
+
+        memcpy(buf, entry->long_ident, n);
+        // The high bit on the first byte marks the start of each ident.
+        buf[0] |= 0x80;
+        fwrite(buf, 1, n, f);
+
+        entry = entry->hh.next;
+    }
+}
+
+/// Add one identifier parsed from an ident table to the lookup table.
+///
+/// On entry, each->short_ident holds the original name read from the table.
+/// The next generated (minified) name is written into each->long_ident, which
+/// is the hash key, so the table ends up loaded "backwards".
+static void _register_loaded_ident(ident_t * each)
+{
+    // NOTE(review): the result of _get_next_ident() is ignored, exactly as
+    // before this refactor - confirm what should happen if it reports
+    // failure.
+    _get_next_ident(each->long_ident);
+    HASH_ADD_STR(_idents, long_ident, each);
+}
+
+/// Find and load an ident table written by ls_minify_emit_mit().
+///
+/// @param code - buffer holding the source code followed by the table
+/// @param plen - in: length of code; out: length of the code that precedes
+///               the table (only modified when true is returned)
+///
+/// @return true if a table was found somewhere after the first byte of code,
+///         false if there is no table or it starts at offset 0
+bool ls_minify_load_mit(uint8_t const * code, size_t * plen)
+{
+    // TODO: memmem isn't standard, but most libcs (including avr) seem
+    // to have it nowadays. If anybody complains just reimplement.
+    uint8_t const * pmagic =
+        (uint8_t const *) memmem(code, *plen, magic, sizeof(magic));
+
+    // No table at all, or a table with no code in front of it.
+    if (!pmagic || pmagic == code)
+        return false;
+
+    size_t const len = *plen;
+    size_t const start = (size_t)(pmagic - code) + sizeof(magic);
+    *plen = (size_t)(pmagic - code);
+
+    ident_t * each = NULL;
+    size_t j = 0;
+    _idents = NULL;
+
+    for (size_t i = start; i < len; i++)
+    {
+        uint8_t const c = code[i];
+
+        // ls_minify_emit_mit() writes the table without any line breaks,
+        // so an end-of-line byte means the table is over. Without this
+        // check a trailing newline was appended to the last identifier.
+        if (c == '\n' || c == '\r')
+            break;
+
+        // A byte with the high bit set starts a new identifier.
+        if (c & 0x80)
+        {
+            if (each)
+                _register_loaded_ident(each);
+            each = explain_calloc_or_die(1, sizeof(ident_t));
+            j = 0;
+        }
+
+        // Bytes seen before the first marked byte belong to no identifier
+        // and are dropped (this used to dereference a NULL pointer).
+        // Stopping at LS_IDENT_LEN keeps the terminating zero that calloc
+        // provided - assumes short_ident holds LS_IDENT_LEN + 1 bytes,
+        // TODO confirm against ident_t.
+        if (each && j < LS_IDENT_LEN)
+            each->short_ident[j++] = (char)(c & 0x7F);
+    }
+
+    // The last identifier has no following marker byte to flush it.
+    if (each)
+        _register_loaded_ident(each);
+
+    return true;
+}
+
// --- PRIVATE FUNCTION DEFINITIONS --------------------------------------------
static bool _get_next_ident(char * ident)
@@ -88,7 +160,6 @@ static bool _get_next_ident(char * ident)
// very brain and i don't see how to fix it. so you only get like 48
// billion idents instead of 52 billion. meh.
static bool _out_of_idents = false;
- static uint32_t _last_ident = 0;
// It's possible that the ident we generate will be a valid keyword,
// so just try to convert it.
diff --git a/ls_minify_identgen.h b/ls_minify_identgen.h
index d1c6337..27272bb 100644
--- a/ls_minify_identgen.h
+++ b/ls_minify_identgen.h
@@ -12,6 +12,7 @@
#include <stdbool.h>
#include <stddef.h>
#include <inttypes.h>
+#include <stdio.h>
// --- PUBLIC MACROS -----------------------------------------------------------
// --- PRIVATE DATATYPES -------------------------------------------------------
@@ -25,4 +26,25 @@
/// return the same value.
char const * ls_minify_identgen(char const * ident);
+/// Emit an idents table for later un-minification
+///
+/// This emits a long special REM-comment. This starts with an unmistakable
+/// magic number consisting of "MIT" (Minifier Ident Table) with each byte
+/// or'd with 0x80. Then, each ident is emitted in the order they were
+/// originally passed to ls_minify_identgen(), delimited by 0x80-or'ing the
+/// first byte of each.
+void ls_minify_emit_mit(FILE * f);
+
+/// Load an idents table emitted by ls_minify_emit_mit().
+///
+/// Loads the internal hash table "backwards" so that ls_minify_identgen()
+/// will un-minify.
+///
+/// @param code - source code to search and load from
+/// @param plen - length of code. Will be modified to the length of the code
+/// up to but not including the MIT.
+///
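+/// @return true if a table was found after at least one byte of code; false if
+/// no table is present or the table starts at offset 0 (plen is then
+/// left unchanged)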
+bool ls_minify_load_mit(uint8_t const * code, size_t * plen);
+
#endif // !defined(LS_MINIFY_IDENTGEN_H)