Re: Generate text, given a regex

Gene <gene.ressler@gmail.com>
Fri, 11 Apr 2008 13:23:32 -0700 (PDT)

          From comp.compilers

Related articles
Generate text, given a regex midhatali@gmail.com (Midhat) (2008-03-23)
Re: Generate text, given a regex m.helvensteijn@gmail.com (2008-03-27)
Re: Generate text, given a regex gene.ressler@gmail.com (Gene) (2008-03-26)
Generate text, given a regex domenico.bianculli@lu.unisi.ch (Domenico Bianculli) (2008-03-27)
Re: Generate text, given a regex gene.ressler@gmail.com (Gene) (2008-04-11)
Re: Generate text, given a regex rsc@swtch.com (Russ Cox) (2008-04-11)
| List of all articles for this month |

From: Gene <gene.ressler@gmail.com>
Newsgroups: comp.compilers
Date: Fri, 11 Apr 2008 13:23:32 -0700 (PDT)
Organization: Compilers Central
References: 08-03-095 08-03-102
Keywords: parse, testing
Posted-Date: 11 Apr 2008 16:48:58 EDT

On Mar 27, 12:15 am, Gene <gene.ress...@gmail.com> wrote:
> On Mar 24, 2:48 am, Midhat <midhat...@gmail.com> wrote:> Hi. I want to
generate text based on a given regex. just any text that
> > satisifies the regex. Are there any existing tools/libraries to do
> > that.
> > [The harder question of coming up with strings that satisfy a
> > context free grammar has been discussed at length. See for
> > example http://compilers.iecc.com/comparch/article/91-04-055 -John]
>
> Wow. I really admire John's memory, having posted on the CFG topic 17
> years ago. It would be a simple matter to convert a regex to a
> regular grammar. The Common Lisp code in the last article would then
> print all the strings given by the regex in increasing order of
> length.
> [Hey, I have this search engine on the archives, you know. -John]


Well, you have to remember what to search for! I'm still admiring.


Nonetheless, it's an interesting exercise. Will probably use it in a
data structures course some time in the future. If you want to
implement the "obvious" recursive solution and still generate every
possible string, you effectively need coroutines to do alternation.
Domenico's tool (at least the on-line "tryit" page) seems e.g. to have
trouble with a*|b*. On the other hand, if you convert the regex to a
finite automaton, you can generate strings in strictly increasing
length by what amounts to breadth first search. The most efficient
would be a DFA, but you can make do with an NFA. Here's an idea.
Sorry the code is long:


(defun parse-re (s)
    "Parse a regex given as a string into a lispish form by recursive
      descent. Underscore is empty string."
    ;; Recognizing +, ?, [], [^ ], etc. is left as an exercise.
    (let ((i 0))
        (labels ((peek () (if (>= i (length s)) :end-input (char s i)))
(advance () (incf i))
(next () (prog1 (peek) (advance)))
(expr ()
(do ((rtn (term)))
((not (eql (peek) #\|)) rtn)
(advance)
(setq rtn `(or ,rtn ,(term)))))
(term ()
(do ((rtn (star)))
((member (peek) '(#\) #\| #\* :end-input)) rtn)
(setq rtn `(and ,rtn ,(star)))))
(star ()
(do ((rtn (factor)))
((not (eql (peek) #\*)) rtn)
(advance)
(setq rtn `(* ,rtn))))
(factor ()
(case (peek)
(#\(
(advance)
(prog1 (expr)
(unless (eql (next) #\) )
(error "unmatched paren"))))
(#\_
(advance)
nil)
(t (next)))))
            (expr))))


;;; State machine structures.
(defstruct state in out accept-p)
(defstruct transition from on to)
(defstruct fa start stop states transitions)


(defun transition (from to &optional on)
    "Make a new transition and link to corresponding states."
    (let ((x (make-transition :from from :to to :on on)))
        (push x (state-in to))
        (push x (state-out from))
        x))


(defun re-to-nfa (re)
    "Use Thompson's construction to make a NFA for a regex."
    ;; Lots of optimizations possible.
    (cond ((or (null re) (characterp re))
(let* ((start (make-state))
(stop (make-state))
(x (transition start stop re)))
(make-fa :start start :stop stop
:states (list start stop)
:transitions (list x))))
((eq 'and (first re))
(let* ((fa-1 (re-to-nfa (second re)))
(fa-2 (re-to-nfa (third re)))
(x (transition (fa-stop fa-1) (fa-start fa-2))))
(make-fa :start (fa-start fa-1) :stop (fa-stop fa-2)
:states (append (fa-states fa-1) (fa-states fa-2))
:transitions (append (fa-transitions fa-1)
(fa-transitions fa-2)
(list x)))))
((eq 'or (first re))
(let* ((fa-1 (re-to-nfa (second re)))
(fa-2 (re-to-nfa (third re)))
(a (make-state))
(b (make-state))
(x1 (transition a (fa-start fa-1)))
(x2 (transition a (fa-start fa-2)))
(y1 (transition (fa-stop fa-1) b))
(y2 (transition (fa-stop fa-2) b)))
(make-fa :start a :stop b
:states (append (fa-states fa-1)
(fa-states fa-2)
(list a b))
:transitions (append (fa-transitions fa-1)
(fa-transitions fa-2)
(list x1 x2 y1 y2)))))
((eq '* (first re))
(let* ((fa (re-to-nfa (second re)))
(a (make-state))
(b (make-state))
(x0 (transition a b))
(x1 (transition a (fa-start fa)))
(x2 (transition (fa-stop fa) b))
(y (transition (fa-stop fa) (fa-start fa))))
(make-fa :start a :stop b
:states (append (fa-states fa) (list a b))
:transitions (append (fa-transitions fa)
(list x0 x1 x2 y)))))
(t (error "unrecognized re ~a" re))))


;;; A token is assigned to some state and carries along a string of
all
;;; characters of transitions the token has made so far. We use a
;;; list representation so that hash tables with :test #'equal will
;;; store uniquified sets of tokens, which are just mapped to T.
(defstruct (token (:type list)) state (str ""))


(defun mark-accepting (nfa)
    "Mark as accepting all states that can reach the stop state of the
      nfa through epsilon transitions. Return the modified nfa."
    ;; Algorithm is DFS from the stop state backward.
    (labels ((mark (stop visited)
(unless (member stop visited)
(setf (state-accept-p stop) t)
(dolist (x (state-in stop))
(when (null (transition-on x))
(mark (transition-from x) (cons stop visited)))))))
        (mark (fa-stop nfa) nil))
    nfa)


(defun updated-token (org-token x)
    "From an original token and a transition, build a new token with
      string updated by appending the transition character."
    (make-token
      :state (transition-to x)
      :str (format nil "~a~a" (token-str org-token) (transition-on x))))


(defun advance-1 (token new-tokens)
    "Advance just one token. The token may be cloned due to multiple
      epsilon or duplicate character transitions out of the same state.
      If advancing takes a token to an accepting state, print its
      string to standard output."
    (labels ((epsilon-close (state visited)
(unless (member state visited)
(dolist (x (state-out state))
(if (null (transition-on x)) ; epsilon transition
(epsilon-close (transition-to x) (cons state visited))
(let ((new-token (updated-token token x)))
(setf (gethash new-token new-tokens) t)
(when (state-accept-p (token-state new-token))
(format t "~%~a" (token-str new-token)))))))))
        (epsilon-close (token-state token) nil)))


(defun advance (tokens)
    "Advance all the tokens in a map. If a token reaches an accepting
      state, print the string it carries to standard output."
    (let ((new-tokens (make-hash-table :test #'equal)))
        (maphash #'(lambda (token true)
(declare (ignore true))
(advance-1 token new-tokens)) tokens)
        new-tokens))


(defun initial-tokens (fa)
    "Return a token set hash with a token for the start state of the
      given fa already inserted."
    (let* ((tokens (make-hash-table :test #'equal))
(init-token (make-token :state (fa-start fa))))
        (setf (gethash init-token tokens) t)
        tokens))


(defun gen (re-as-string &optional length-limit)
    "Generate strings due to the NFA."
    (do* ((re (parse-re re-as-string))
(nfa (mark-accepting (re-to-nfa re)))
(tokens (advance (initial-tokens nfa)) (advance tokens))
(n 1 (1+ n)))
              ((or (zerop (hash-table-count tokens))
(and length-limit (>= n length-limit)))
(values))))


(defun test (&optional (re "(a*|bcd)*") (length-limit 7))
    (format t "Results of (gen ~s ~s):" re length-limit);
    (gen re length-limit))


CL-USER> (test)
Results of (gen "(a*|bcd)*" 7):
a
aa
aa
bcd
aaa
aaa
aaaa
aaaa
abcd
bcda
bcdaa
bcdaa
abcda
aaaaa
aaaaa
aabcd
aabcda
aaaaaa
aaaaaa
aaabcd
abcdaa
abcdaa
bcdbcd
bcdaaa
bcdaaa
bcdaaaa
bcdaaaa
bcdabcd
bcdbcda
abcdbcd
abcdaaa
abcdaaa
aaabcda
aaaaaaa
aaaaaaa
aaaabcd
aabcdaa
aabcdaa
; No value
CL-USER>



Post a followup to this message

Return to the comp.compilers page.
Search the comp.compilers archives again.