keyword lookup (Re: Hash specifics)

megatest!djones@decwrl.dec.com (Dave Jones)
20 Dec 90 00:18:15 GMT

          From comp.compilers

Related articles
keyword lookup (Re: Hash specifics) megatest!djones@decwrl.dec.com (1990-12-20)
| List of all articles for this month |
Newsgroups: comp.compilers
From: megatest!djones@decwrl.dec.com (Dave Jones)
Keywords: design
Organization: Megatest Corporation, San Jose, Ca
References: <9012111913.AA07310@cs-sun-fsa.cpsc.ucalgary.ca>
Date: 20 Dec 90 00:18:15 GMT

> I want to hash the one hundred + opcodes for the Intel 386 processor
> into the smallest possible table with the fewest collisions.


Rather than using a hash-table, why not use a nested, string-matching
switch-statement? (Well, if you are strapped for memory, that's one reason
why not.) Anyway, that's what I did for the Pascal compiler I just completed.
(Because it is to be used as part of an interactive debugger, the overriding
consideration was speed of compilation.) Needless to say, I did not write the
switch by hand (shudder). I wrote a little program to write the switch. I am
enclosing it here. I am not posting it to a sources-group because it needs a
little work to make it more presentable. Specifically, the way it "knows" the
keyword/token-value mapping is a kluge: The pairs are compiled right into the
program by means of an include-file. (The ones included here are for Pascal).
They should instead be read from a file, so the program does not need to be
recompiled on every change to the keyword map. Indeed, the way it is set up,
you may want to hack the source to get exactly what your want. I've put in
some comments to guide. If you want to polish this up a bit and post it to a
sources-group, please feel free.


One more thing: There are some pretty good lex-substitutes floating around,
or so I've heard. You might find that if you give one of those programs only
a list of keywords, you may get back a super-efficient switch much like the
one this program generates.


Okay, yet one more thing: If you have a lot of keywords, and you compile this
on an old, brain-damaged BSD compiler, remember to use the -J flag, which
means, "Do not generate pseudo-random jumps." :-(
[I'd be interested to hear some speed comparisons with a flex lexer would
be interesting. A quick look at the code for both doesn't make it obvious
which would be faster. -John]


snip ^^^ snip ^^^ snip ^^^ snip ^^^ snip ^^^ snip ^^^ snip ^^^ snip ^^^
#! /bin/sh
# This is a shell archive. Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file". To overwrite existing
# files, type "sh file -c". You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g.. If this archive is complete, you
# will see the following message at the end:
# "End of shell archive."
# Contents: keyfinder.c keywords.h
# Wrapped by djones@goofy on Wed Dec 19 16:14:36 1990
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f keyfinder.c -a "${1}" != "-c" ; then
    echo shar: Will not over-write existing file \"keyfinder.c\"
else
echo shar: Extracting \"keyfinder.c\" \(3780 characters\)
sed "s/^X//" >keyfinder.c <<'END_OF_keyfinder.c'
X#include <stdio.h>
X#define print (void)printf
X
Xchar* Emalloc();
X
Xtypedef struct {
X char *string; /* keyword string to be matched */
X char *value; /* value to return on match */
X}Keyword_def ;
X
X/*
X * Kluge to set up keyword/value pairs:
X * compile them into the program. It would be
X * better, probably, if we read them from a file.
X */
XKeyword_def init_keys[] =
X{
X#include "keywords.h"
X , 0, 0
X};
X
Xmain()
X{
X
X /*
X * In the resulting switch, the macro EOSTR(chr,len)
X * calculates whether or not the scan of [len]
X * chars, ending looking at [chr] has inspected the whole string.
X * Thus if the string is null-terminated, you use this (default):
X *
X * #define EOSTR(chr,len) ((chr) == 0)
X *
X * Using counted strings, you can substitute a macro that compares [len]
X * against the known length of the string.
X *
X * #define EOSTR(chr,len) ((len) == known_len)
X *
X * Another possibility would be ...
X *
X * #define EOSTR(chr,len) (!isalnum(chr))
X *
X */
X
X print("#ifndef EOSTR\n# define EOSTR(chr,length) ((chr)==0)\n#endif\n");
X
X /* We generate a function called, "keyis()".
X */
X print("keyis(str)\n char* str;\n{\n");
X
X gen_switch("");
X
X /* In the resulting function, if no match is found, return -1.
X */
X print(" return -1;\n}\n");
X
X if(ferror(stdout)) {
X perror("stdout");
X exit(-1);
X }
X}
X
X
Xchar*
Xextend(str, ch)
X char* str;
X char ch;
X{
X int len = strlen(str);
X char* retval = Emalloc(len+2);
X strcpy(retval, str);
X retval[len] = ch;
X retval[len+1] = '\0';
X return retval;
X}
X
Xchar*
Xfind(str)
X char* str;
X{
X Keyword_def *key = init_keys;
X
X for(;key->string;key++) {
X if(strcmp(str, key->string)==0)
X return key->value;
X }
X
X return 0;
X
X}
X
X/*
X * Generate a switch under the assumption that
X * the chars in [str] have already been matched.
X */
Xgen_switch(str)
X char* str;
X{
X int index = strlen(str);
X char done[128];
X
X { int i;
X for(i =0; i<128; i++) /* 128 == number of chars. (ASCII specific) */
X done[i] = 0;
X }
X
X { Keyword_def *key = init_keys;
X char* value;
X int do_switch;
X
X /* If there is only one possible case, no need to generate
X * a switch.
X */
X
X do_switch = cases(str);
X
X if(do_switch) {
X
X blanks(3*index);
X print("switch(*str++) /* %s */\n", str);
X blanks(3*index);
X print("{ default: ");
X
X if(value = find(str))
X print("if(EOSTR(*str,%d)) return %s; ", index, value);
X
X print("break;\n");
X
X for(;key->string;key++) {
X char ch;
X
X if(index<strlen(key->string))
X ch = key->string[index];
X else
X ch = 0;
X
X if(ch) {
X if(!match(key->string, str) != 0)
X continue;
X
X if(done[ch])
X continue;
X else
X done[ch] = 1;
X
X blanks(3*index + 2);
X print("case '%c': /* %s%c */\n", ch,str,ch);
X gen_switch(extend(str), ch);
X blanks(3*index + 2);
X print("break;\n");
X }
X }
X
X blanks(3*index);
X print("}\n");
X }
X else {
X /* One case only. */
X blanks(3*index);
X if(value = find(str))
X print("if(EOSTR(*str,%d)) return %s;\n", index, value);
X }
X }
X}
X
Xcases(str)
X char* str;
X{
X int index = strlen(str);
X Keyword_def *key = init_keys;
X
X for(;key->string;key++) {
X char ch;
X
X if(index<strlen(key->string))
X ch = key->string[index];
X else
X ch = 0;
X
X if(ch) {
X if(!match(key->string, str) != 0)
X continue;
X
X return 1;
X }
X }
X return 0;
X}
X
Xblanks(i)
X{
X print(" ");
X while(i--)
X (void)putchar(' ');
X}
X
Xextern char* malloc();
X
Xchar* Emalloc(size)
X{
X char *retval = malloc((unsigned)size);
X
X if(retval == 0) {
X perror("malloc()");
X exit(-1);
X }
X return retval;
X}
X
Xmatch(key, str)
X char* key;
X char* str;
X{
X return strncmp(key,str, strlen(str)) == 0;
X}
END_OF_keyfinder.c
if test 3780 -ne `wc -c <keyfinder.c`; then
        echo shar: \"keyfinder.c\" unpacked with wrong size!
fi
# end of overwriting check
fi
if test -f keywords.h -a "${1}" != "-c" ; then
    echo shar: Will not over-write existing file \"keywords.h\"
else
echo shar: Extracting \"keywords.h\" \(743 characters\)
sed "s/^X//" >keywords.h <<'END_OF_keywords.h'
X
X "and", "AND"
X, "array", "ARRAY"
X, "begin", "BEGIN"
X, "case", "CASE"
X, "const", "CONST"
X, "div", "DIV"
X, "do", "DO"
X, "downto", "DOWNTO"
X, "else", "ELSE"
X, "end", "END"
X, "external", "EXTERNAL"
X, "file", "FILEX"
X, "for", "FOR"
X, "forward", "FORWARD"
X, "function", "FUNCTION"
X, "goto", "GOTO"
X, "if", "IF"
X, "in", "IN"
X, "insert", "INSERT"
X, "label", "LABEL"
X, "mod", "MOD"
X, "nil", "NIL"
X, "not", "NOT"
X, "of", "OF"
X, "or", "OR"
X, "others", "OTHERS"
X, "packed", "PACKED"
X, "procedure", "PROCEDURE"
X, "program", "PROGRAM"
X, "record", "RECORD"
X, "repeat", "REPEAT"
X, "set", "SET"
X, "then", "THEN"
X, "to", "TO"
X, "type", "TYPE"
X, "until", "UNTIL"
X, "var", "VAR"
X, "while", "WHILE"
X, "with", "WITH"
END_OF_keywords.h
if test 743 -ne `wc -c <keywords.h`; then
        echo shar: \"keywords.h\" unpacked with wrong size!
fi
# end of overwriting check
fi
echo shar: End of shell archive.
exit 0
--


Post a followup to this message

Return to the comp.compilers page.
Search the comp.compilers archives again.