Related articles |
---|
html and flex deleveld@dds.nl (1999-01-25) |
Re: html and flex rogerb@sco.COM (1999-01-27) |
Re: html and flex mikee@cetasoft.cog (1999-01-27) |
Re: html and flex deleveld@dds.nl (1999-02-03) |
From: | deleveld@dds.nl |
Newsgroups: | comp.compilers |
Date: | 3 Feb 1999 23:57:22 -0500 |
Organization: | Compilers Central |
References: | 99-01-093 99-01-106 |
Keywords: | lex, WWW |
On 25 Jan 1999 21:53:44 -0500, deleveld@dds.nl wrote:
> [ want advice on lexing and parsing HTML ]
mikee@cetasoft.cog (Mike Enright) wrote:
> Many browsers (both browsers?) allow broken HTML to work. Therefore to
> read those same pages, you will have to accept code that doesn't match
> the specs, like <A HREF="somewhere.html> (missing quote within a tag)
> or &lt (missing semicolon after &lt). Maybe a
> lexer can be written in lex that deals with such things. My feeling is
> it would be a large grammar if it worked.
Yes, thank you for the advice. I had expected html to be somewhat standardized
but looking at some pages I got from the web, I see that there is a lot of bad
html code around. I'll do my best to get around any mistakes.
Anyway, if anyone is interested here is the flex file for the simple scanning
until now...
%{
#include <string.h>
#include "v_html.h"
%}
/* Scanner options.
   noyywrap          - do not call yywrap() at end of input.
   never-interactive - the input is never treated as interactive.
   yyclass           - the generated yylex() is a member of HtmlLexer.
   case-insensitive  - HTML tag and attribute names are not case-sensitive,
                       so <HTML>, <Body> or <A HREF=...> must match the same
                       patterns as their lowercase spellings.  The matched
                       text itself is left unchanged. */
%option noyywrap
%option never-interactive
%option yyclass="HtmlLexer"
%option case-insensitive
/* Inclusive start conditions: INBODY is entered on the body tag and INHEAD
   on the head tag; rules without a start condition stay active in both. */
%s INBODY INHEAD
/* Basic Tag related: opening and closing tags of the document, its head
   and its body.  Each pattern is one quoted string, so a tag that carries
   attributes or extra whitespace is not matched by these names. */
HTML "<html>"
OUTHTML "</html>"
HEAD "<head>"
OUTHEAD "</head>"
BODY "<body>"
OUTBODY "</body>"
/* Link related.
   HYPERLINK  - an anchor whose href value opens with a double quote and
                whose first character after that quote is not '#'.
   PAGELINK   - an anchor whose quoted href value starts with '#'.
   PAGETARGET - an anchor whose first attribute is name.
   LINKEND    - the closing anchor tag.
   Only spaces (no tabs or newlines) are accepted between the tag name, the
   attribute name, the '=' and the quote.  The [^>]* part runs up to the
   next '>' and, being a negated class, may also cross newlines. */
HYPERLINK "<a"[ ]+"href"[ ]*"="[ ]*"\""[^#][^>]*">"
PAGELINK "<a"[ ]+"href"[ ]*"="[ ]*"\"#"[^>]*">"
PAGETARGET "<a"[ ]+"name"[^>]*">"
LINKEND "</a>"
/* List related: unordered-list and list-item tags, matched only in their
   attribute-free form. */
UL "<ul>"
OUTUL "</ul>"
LI "<li>"
OUTLI "</li>"
/* Spacing related: paragraph, line break and preformatted-text tags,
   matched only in their attribute-free form. */
PARAGRAPH "<p>"
LINEBREAK "<br>"
PRE "<pre>"
OUTPRE "</pre>"
/* Ignore unknown tags: any '<' followed by non-'>' characters (newlines
   included) and a closing '>'.  When a more specific tag pattern matches
   text of the same length, flex picks whichever rule is listed first in
   the rules section. */
UNKNOWNTAG "<"[^>]*">"
/* Special characters: HTML character entity references.  The patterns must
   be the entity spellings, not the characters they stand for: a literal
   space would shadow WHITESPACE for single blanks, and literal '<' / '>'
   are tag delimiters.  TEXTWORD stops at '&', so an entity always starts a
   fresh token. */
NBSP "&nbsp;"
GREATERTHAN "&gt;"
LESSTHAN "&lt;"
/* Printable stuff.
   WHITESPACE - a run of spaces and/or tabs.
   RETURN     - a run of one or more newline characters.
   TEXTWORD   - a run of characters other than space, tab, newline,
                '<', '>' and '&'. */
WHITESPACE [ \t]+
RETURN [\n]+
TEXTWORD [^ \t\n<>&]+
%%
{HTML} {
    // Basic Tag related -----------------------------------------------
    // Document start tag: raise the flag that the HEAD, BODY and
    // UNKNOWNTAG rules test before acting.
    InHtml = 1;
}
{OUTHTML} {
    // Document end tag: lower the flag again.
    InHtml = 0;
}
{HEAD} {
    // A head tag only switches start condition once InHtml is set.
    if(InHtml)
    {
        BEGIN(INHEAD);
    }
}
<INHEAD>{OUTHEAD} {
    // Leaving the head puts the scanner back into the INITIAL condition.
    BEGIN(INITIAL);
}
{BODY} {
    // A body tag only switches start condition once InHtml is set.
    if(InHtml)
    {
        BEGIN(INBODY);
    }
}
<INBODY>{OUTBODY} {
    // Leaving the body puts the scanner back into the INITIAL condition.
    BEGIN(INITIAL);
}
<INBODY>{UL} {
    // List related ----------------------------------------------
    // Opening a list: go one nesting level deeper and start a new line.
    ++ListLevel;
    Print("\n");
    LineCount += 1;
    WidthCount = 0;
}
<INBODY>{OUTUL} {
    // Closing a list: come up one nesting level, but never below zero,
    // then start a new line.
    if(ListLevel)
    {
        --ListLevel;
    }
    Print("\n");
    LineCount += 1;
    WidthCount = 0;
}
<INBODY>{LI} {
    // A new item while the previous one is still open: finish the
    // previous item's line first.
    if(InListElem)
    {
        Print("\n");
        LineCount += 1;
        WidthCount = 0;
    }
    InListElem = true;

    // Indent by one blank per nesting level, tracking the printed width.
    int remaining = ListLevel;
    while(remaining-- > 0)
    {
        Print(" ");
        WidthCount += Width(" ");
    }

    // Bullet marker for the item.
    Print("* ");
    WidthCount += Width("* ");
}
<INBODY>{OUTLI} {
    // Explicit end of a list item: end the line and mark the item closed
    // so the next item does not emit a second line break.
    Print("\n");
    LineCount += 1;
    WidthCount = 0;
    InListElem = false;
}
<INBODY>{LINEBREAK} {
    // Spacing related ------------------------------------
    // Line break: one newline, one more line counted, column back to zero.
    Print("\n");
    LineCount += 1;
    WidthCount = 0;
}
<INBODY>{PARAGRAPH} {
    // Paragraph: end the current line and leave one blank line, which
    // accounts for two line breaks in the line counter.
    Print("\n\n");
    LineCount = LineCount + 2;
    WidthCount = 0;
}
<INBODY>{PRE} {
    // Start of preformatted text: the printable-text rules test this flag
    // and then call Print() instead of WrapPrint().
    InPreformat = 1;
}
<INBODY>{OUTPRE} {
    // End of preformatted text.
    InPreformat = 0;
}
<INBODY>{HYPERLINK} {
    // Link related ---------------------------------------
    // Start of a hyperlink anchor.  The whole matched tag text is logged
    // and kept as the pending link destination.
    const char *tagText = YYText();
    cout << "HLink: " << tagText << endl;

    // Replace any previously remembered destination with a fresh copy.
    free(LastLinkDest);
    LastLinkDest = strdup(tagText);

    // Remember where the linked text begins.
    LastLinkLine = LineCount;
    LastLinkBegin = WidthCount;
    InLinkedText = 1;
}
<INBODY>{PAGELINK} {
    // Start of an in-page link anchor.  The whole matched tag text is
    // logged and kept as the pending link destination.
    const char *tagText = YYText();
    cout << "PLink: " << tagText << endl;

    // Release a previously remembered destination, then store a copy.
    if(LastLinkDest)
    {
        free(LastLinkDest);
    }
    LastLinkDest = strdup(tagText);

    // Remember where the linked text begins.
    LastLinkLine = LineCount;
    LastLinkBegin = WidthCount;
    InLinkedText = 1;
}
<INBODY>{PAGETARGET} {
    // Named anchor: log the tag and flag that we are inside anchor text.
    // No link destination is recorded here.
    const char *tagText = YYText();
    cout << "Page: " << tagText << endl;
    InLinkedText = 1;
}
<INBODY>{LINKEND} {
    // End of an anchor.  If a destination was recorded by a link-start
    // rule, store a link spanning from the remembered start column to the
    // current column, then drop the pending destination.
    InLinkedText = 0;
    if(LastLinkDest)
    {
        HtmlLink NewLink(LastLinkDest,LastLinkLine,LastLinkBegin,WidthCount);
        Link.append(NewLink);

        // Clear the pointer after freeing it.  Otherwise the next link
        // start would free() it a second time, and the closing tag of a
        // named anchor would build a link from already-freed memory.
        free(LastLinkDest);
        LastLinkDest = NULL;
    }
}
{UNKNOWNTAG} {
    // Ignore unknown tags ---------------------------------------
    // Tags without a dedicated rule produce no page output; inside the
    // document they are only logged.
    if(InHtml)
    {
        cout << "Tag: " << YYText() << endl;
    }
}
<INBODY>{NBSP} {
    // Special characters --------------------------------------
    // NBSP is rendered as a single blank.
    WrapPrint(" ");
}
<INBODY>{GREATERTHAN} {
    // GREATERTHAN is rendered as the '>' character.
    WrapPrint(">");
}
<INBODY>{LESSTHAN} {
    // LESSTHAN is rendered as the '<' character.
    WrapPrint("<");
}
<INBODY>{WHITESPACE} {
    // Printable -----------------------------------------
    // Outside preformatted text a run of blanks collapses to one space;
    // inside it the run is printed as matched and its width is counted.
    if(!InPreformat)
    {
        WrapPrint(" ");
    }
    else
    {
        Print(YYText());
        WidthCount += Width(YYText());
    }
}
<INBODY>{RETURN} {
    // Newlines are only output inside preformatted text; elsewhere they
    // are dropped.  (The original kept a disabled alternative for that
    // case: Print(" "); LineCount++; WidthCount = 0;)
    if(InPreformat)
    {
        Print(YYText());
        // The match consists solely of newlines, so its length is the
        // number of lines to add.
        LineCount += strlen(YYText());
        WidthCount = 0;
    }
}
<INBODY>{TEXTWORD} {
    // A word of body text: wrapped normally, but printed as matched (with
    // its width counted) inside preformatted text.
    if(!InPreformat)
    {
        WrapPrint(YYText());
    }
    else
    {
        Print(YYText());
        WidthCount += Width(YYText());
    }
}
<INBODY>. {
    // Unknown text in body: any single character no earlier rule
    // claimed is output the same way as a text word.
    if(!InPreformat)
    {
        WrapPrint(YYText());
    }
    else
    {
        Print(YYText());
        WidthCount += Width(YYText());
    }
}
<*>.|\n { // Ignore totally unknown text
    // '.' does not match a newline in flex, and the RETURN rule is only
    // active in INBODY.  Without the explicit \n alternative, newlines
    // seen in any other start condition would fall through to flex's
    // default rule and be echoed to the output instead of being ignored.
}
%%
Return to the
comp.compilers page.
Search the
comp.compilers archives again.