/* $Cambridge: hermes/src/prayer/session/html_secure_tidy.c,v 1.14 2010/07/20 09:47:53 dpc22 Exp $ */

/* Clean up HTML using tidy library. Drop in replacement for html_secure */

#include "prayer_session.h"

#ifdef TIDY_ENABLE

/* Need /usr/include/tidy on include path */
#include <tidy.h>
#include <buffio.h>

/* ====================================================================== */

/* A couple of static utility functions to quietly unpick CSS character
   entities, which are \<hex sequence> up to six digits long. Has to be
   exactly six characters long (pad with leading zeroes) if following
   character is another valid hex digit, which is reason for tmp buffer */

/* Map one hexadecimal digit to its value, or -1 if c is not a hex digit.
   Hand-rolled so that the decoder below does not inherit strtoul()'s
   tolerance of leading whitespace, signs and "0x" prefixes, none of which
   are part of a CSS escape. */
static int
css_hex_digit(char c)
{
    if ((c >= '0') && (c <= '9'))
        return (c - '0');
    if ((c >= 'a') && (c <= 'f'))
        return (c - 'a' + 10);
    if ((c >= 'A') && (c <= 'F'))
        return (c - 'A' + 10);
    return (-1);
}

/* unquote_css_inline() *************************************************
 *
 * Decode CSS backslash escapes in place. The result is never longer than
 * the input, so the string is simply compacted towards its start.
 *
 *   s: NUL terminated, writable string. Modified in place.
 *
 * Rules applied:
 *   - "\" followed by one to six hex digits is a character escape. A
 *     single whitespace character (space, tab, LF, CR, FF, or a CR LF
 *     pair) directly after the digits terminates the escape and is
 *     swallowed with it, so "\65 x" decodes to "ex", not "e x".
 *   - Decoded values outside 1..127 are dropped: caller only cares
 *     about ASCII.
 *   - "\" followed by anything else: the backslash is dropped and the
 *     following character is processed normally on the next iteration.
 ***********************************************************************/

static void
unquote_css_inline(char *s)
{
    char *d = s;

    while (*s) {
        unsigned long unicode = 0;
        int ndigits = 0;
        int v;

        if (*s != '\\') {
            /* Ordinary character. Nothing to write until the output
               position has fallen behind the input position */
            if (d < s)
                *d = *s;
            d++;
            s++;
            continue;
        }

        s++;                    /* The backslash itself is never copied */

        while ((ndigits < 6) && ((v = css_hex_digit(*s)) >= 0)) {
            unicode = (unicode << 4) | (unsigned long)v;
            ndigits++;
            s++;
        }

        if (ndigits == 0)
            continue;           /* Not a hex escape: see rules above */

        /* One optional whitespace terminator belongs to the escape */
        if ((s[0] == '\r') && (s[1] == '\n'))
            s += 2;
        else if ((*s == ' ') || (*s == '\t') || (*s == '\n') ||
                 (*s == '\r') || (*s == '\f'))
            s++;

        if ((unicode > 0) && (unicode < 128))
            *d++ = (char)(unicode);
    }
    if (d < s)
        *d = '\0';
}

/* unquote_css_entities() ***********************************************
 *
 * Return a copy of s with CSS backslash escapes decoded (the decoding is
 * done by unquote_css_inline() on the copy; s itself is not modified).
 *
 *       s: NUL terminated string.
 *
 * Returns: pointer to a function-static buffer. The buffer is reused and
 *          may be moved by the next call, so the result is only valid
 *          until then and must not be freed by the caller.
 ***********************************************************************/

static char *
unquote_css_entities(char *s)
{
    static char *buf = NULL;
    static unsigned long alloc = 0;     /* Current capacity of buf */
    unsigned long len = strlen(s)+1;

    /* Capacity has to persist between calls alongside buf, otherwise
       the buffer would be reallocated on every single call */
    if (alloc < len) {
        unsigned long want = (len > 4096) ? (len*2) : 4096;
        char *grown = realloc(buf, want);  /* realloc(NULL, n) == malloc(n) */

        if (!grown)
            log_panic("Out of memory");

        buf   = grown;
        alloc = want;
    }
    strcpy(buf, s);

    unquote_css_inline(buf);

    return(buf);
}

/* ====================================================================== */

/* Foul layering violation: Tidy doesn't export these functions */

extern void prvTidyDiscardElement( TidyDoc doc, TidyNode node);
extern void prvTidyRemoveAttribute( TidyDoc doc, TidyNode node, TidyAttr attr);
extern void prvTidyAddAttribute( TidyDoc doc, TidyNode node,
                                 const char *attr, const char *value);

/* test_element_allowed() ***********************************************
 *
 * Decide whether an element may stay in the sanitised document.
 *
 *        name: Element name, compared case-insensitively. NULL is
 *              rejected.
 * show_images: If NIL, "img" elements are rejected as well.
 *
 * Returns: NIL if the name is NULL or on the blacklist below, else T.
 ***********************************************************************/

static BOOL test_element_allowed(const char *name, BOOL show_images)
{
    /* Elements which are always stripped. A linear scan is fine here:
       original author measured <1% of run time in this lookup */
    static const char *const banned[] = {
        "script",
        "app",
        "applet",
        "server",
        "object",
        "html",
        "head",
        "body",
        "meta",
        "title",
        "frame",
        "link",
        "iframe",
        "embed",
        "xml",
        "form",
        "input",
        NULL
    };
    const char *const *entry;

    if (name == NULL)
        return(NIL);

    if (!show_images && (strcasecmp(name, "img") == 0))
        return(NIL);

    for (entry = banned; *entry != NULL; entry++) {
        if (strcasecmp(*entry, name) == 0)
            return(NIL);
    }

    return(T);
}

/* Following is a case-insensitive substring search (not a whole-string
   strcasecmp), allowing for whitespace around ':' or '(' */

/* Anchored helper for mymatch(): does the remainder of the needle (np)
   match the haystack starting exactly at p?

   Letters compare case-insensitively. A ':' or '(' in the needle matches
   optional whitespace, then that exact character, then optional
   whitespace in the haystack. Never reads beyond the haystack's NUL. */

static BOOL
mymatch_tail (const char *p, const char *np)
{
    while (*np) {
        if ((*np == ':') || (*np == '(')) {
            /* ':' in middle of needle matches \s*:\s* in haystack */
            while (*p && Uisspace(*p))
                p++;
            if (*p != *np)
                return NIL;     /* Includes running into end of haystack */
            p++;
            np++;
            while (*p && Uisspace(*p))
                p++;
        } else {
            if ((*p == '\0') || (Utoupper(*p) != Utoupper(*np)))
                return NIL;
            p++;
            np++;
        }
    }
    return T;
}

/* mymatch() ************************************************************
 *
 * Case-insensitive substring search with whitespace tolerance around
 * ':' and '(' (see mymatch_tail). The first needle character is always
 * compared literally (case-insensitively), without whitespace skipping.
 *
 * haystack: NUL terminated string to search.
 *   needle: NUL terminated pattern. An empty needle never matches.
 *
 * Returns: T if needle occurs anywhere in haystack, else NIL.
 *
 * Every start position is tried independently. A single forward pass
 * which just resets on mismatch misses overlapping starts (e.g. "url("
 * inside "uurl("), and must not be allowed to step past the terminator
 * after skipping trailing whitespace.
 ***********************************************************************/

static BOOL
mymatch (char *haystack, char *needle)
{
    char *p;

    if (*needle == '\0')
        return NIL;

    for (p = haystack; *p; p++) {
        if ((Utoupper(*p) == Utoupper(*needle)) &&
            mymatch_tail(p + 1, needle + 1))
            return T;
    }
    return NIL;
}

/* test_style_allowed() *************************************************
 *
 * Decide whether the value of a style attribute may be kept.
 *
 *   value: Attribute value. NULL or empty is rejected.
 *
 * Returns: NIL if the value is NULL/empty or mymatch() finds any of the
 *          patterns below in it, else T. If the value contains a
 *          backslash the patterns are looked for in a copy with CSS
 *          escapes decoded (unquote_css_entities) rather than in the
 *          raw text.
 ***********************************************************************/

static BOOL test_style_allowed(const char *value)
{
    /* Tried in this order; first hit rejects the value */
    static char *forbidden[] = {
        "expression(",
        "javascript:",
        "background-image:url(",
        "content:url(",
        "behaviour:url(",
        "url(",
        NULL
    };
    char *css = (char *)value;
    int i;

    if ((css == NULL) || (css[0] == '\0'))
        return(NIL);

    if (strchr(css, '\\') != NULL)
        css = unquote_css_entities(css);

    for (i = 0; forbidden[i] != NULL; i++) {
        if (mymatch(css, forbidden[i]))
            return(NIL);
    }

    return(T);
}

/* test_href_allowed() **************************************************
 *
 * Whitelist check for link targets.
 *
 *    href: Link target. NULL is rejected.
 *
 * Returns: T if href starts (case-insensitively) with one of the
 *          prefixes listed below, else NIL.
 ***********************************************************************/

static BOOL test_href_allowed(const char *href)
{
    static const char *const safe_prefix[] = {
        "http://",
        "https://",
        "ftp:",
        "wais:",
        "telnet:",
        "cid:",
        "#",
        NULL
    };
    const char *const *pp;

    if (href == NULL)
        return(NIL);

    for (pp = safe_prefix; *pp != NULL; pp++) {
        size_t plen = strlen(*pp);

        if (strncasecmp(href, *pp, plen) == 0)
            return(T);
    }
    return(NIL);
}

/* test_href_needs_blank() **********************************************
 *
 * Report whether a link target is a web URL.
 *
 *    href: Link target. NULL gives NIL.
 *
 * Returns: T if href starts (case-insensitively) with "http://" or
 *          "https://", else NIL. tidy_tree() uses the result to decide
 *          whether to add target="_blank" to an <a> element.
 ***********************************************************************/

static BOOL test_href_needs_blank(const char *href)
{
    if (href == NULL)
        return(NIL);

    if (strncasecmp(href, "http://", strlen("http://")) == 0)
        return(T);

    if (strncasecmp(href, "https://", strlen("https://")) == 0)
        return(T);

    return(NIL);
}


/* test_attribute_allowed() *********************************************
 *
 * Decide whether an attribute may stay on an element.
 *
 *    name: Attribute name, compared case-insensitively. NULL is
 *          rejected.
 *   value: Attribute value, may be NULL.
 *
 * Returns: NIL if the attribute should be removed, else T. Checks, in
 *          order:
 *            - any name starting "on" is rejected (event handlers)
 *            - "href" with a value: verdict of test_href_allowed()
 *            - "style": rejected unless test_style_allowed() accepts it
 *            - names on the blacklist below are rejected
 *
 * NOTE(review): "href" is the only attribute whose value goes through
 * the URL whitelist here; confirm that this is intended for other
 * attributes which can carry URLs.
 ***********************************************************************/

static BOOL test_attribute_allowed(const char *name, const char *value)
{
    static const char *const banned[] = {
        "target",
        "code",
        "codepage",
        "codetype",
        "language",
        NULL
    };
    const char *const *entry;

    if (name == NULL)
        return(NIL);

    /* includes onload and onmouseover */
    if (strncasecmp(name, "on", 2) == 0)
        return(NIL);

    if ((strcasecmp(name, "href") == 0) && (value != NULL))
        return(test_href_allowed(value));

    if (strcasecmp(name, "style") == 0) {
        if (!test_style_allowed(value))
            return(NIL);
    }

    for (entry = banned; *entry != NULL; entry++) {
        if (strcasecmp(*entry, name) == 0)
            return(NIL);
    }
    return(T);
}

/* tidy_tree() **********************************************************
 *
 * Recursively sanitise the children of a Tidy parse tree node.
 *
 *        tnod: Node whose children are processed. NULL is a no-op.
 *        tdoc: Document which owns the tree.
 * show_images: Passed to test_element_allowed().
 *
 * For each child: discard it if test_element_allowed() rejects it,
 * otherwise remove every attribute rejected by test_attribute_allowed(),
 * add target="_blank" to <a> elements whose surviving href is a web URL
 * (test_href_needs_blank), and recurse into the child.
 ***********************************************************************/

static void tidy_tree(TidyNode tnod, TidyDoc tdoc, BOOL show_images)
{
    TidyNode node, sibling;

    if (!tnod)
        return;

    node = tidyGetChild(tnod);
    while (node) {
        ctmbstr name;
        TidyAttr attr, following;
        BOOL want_blank = NIL;

        /* Fetch successor first: node may be discarded below */
        sibling = tidyGetNext(node);

        switch ( tidyNodeGetType(node) ) {
        case TidyNode_Start:
        case TidyNode_End:
        case TidyNode_StartEnd:
        default:
            name = tidyNodeGetName( node );
            break;
        case TidyNode_Root:
        case TidyNode_DocType:
        case TidyNode_Comment:
        case TidyNode_ProcIns:
        case TidyNode_Text:
        case TidyNode_CDATA:
        case TidyNode_Section:
        case TidyNode_Asp:
        case TidyNode_Jste:
        case TidyNode_Php:
        case TidyNode_XmlDecl:
            /* Placeholder name which is not on the element blacklist */
            name = "TidyNode";
            break;
        }

        if (test_element_allowed(name, show_images)) {
            attr = tidyAttrFirst(node);
            while (attr) {
                ctmbstr atname  = tidyAttrName(attr);
                ctmbstr atvalue = tidyAttrValue(attr);

                following = tidyAttrNext(attr);  /* attr might be removed */

                if (!test_attribute_allowed(atname, atvalue))
                    prvTidyRemoveAttribute(tdoc, node, attr);
                else if (strcmp(atname, "href") == 0)
                    want_blank = test_href_needs_blank(atvalue);

                attr = following;
            }

            if ((strcasecmp(name, "a") == 0) && want_blank)
                prvTidyAddAttribute(tdoc, node, "target", "_blank" );

            tidy_tree( node, tdoc, show_images );
        } else {
            prvTidyDiscardElement(tdoc, node);
        }

        node = sibling;
    }
}

/* html_secure_tidy() ***************************************************
 *
 * Parse HTML with Tidy, strip unwanted elements and attributes from the
 * body (tidy_tree) and append the result to a buffer.
 *
 *     session: Used for logging only.
 *           b: Target buffer; sanitised output is appended with bputc().
 * show_images: If NIL, img elements are removed.
 *       input: NUL terminated HTML to sanitise.
 *
 * Returns: T if every Tidy step reported a non-negative status, NIL
 *          otherwise (the failure is logged, nothing is appended to b).
 ***********************************************************************/

BOOL html_secure_tidy(struct session *session, struct buffer *b,
                      BOOL show_images, char *input)
{
    TidyBuffer errbuf = {0};
    TidyBuffer output = {0};
    int rc = -1;
    TidyDoc tdoc = tidyCreate();
    TidyNode tnod;
    char *s;

    /* Each step only runs if everything before it succeeded (rc >= 0) */
    rc = tidyOptSetBool( tdoc, TidyXhtmlOut, yes ) ? 0 : -1;
    if ( rc >= 0 )
        rc = tidySetErrorBuffer( tdoc, &errbuf );
    if ( rc >= 0 )
        rc = tidySetCharEncoding(tdoc, "utf8" );
    if ( rc >= 0 )
        rc = (tidyOptSetInt(tdoc, TidyBodyOnly, 1) ? rc : -1);

    if ( rc >= 0 )
        rc = tidyParseString( tdoc, input );

    if ( rc >= 0 ) {
        rc = tidyCleanAndRepair( tdoc );
    }
    if ( rc >= 0 )
        rc = tidyRunDiagnostics( tdoc );

    /* Status above 1: ask Tidy to produce output regardless */
    if ( rc > 1 )
        rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );

    if ( rc >= 0 ) {
        /* Sanitise parse tree and then dump to output buffer */

        tnod = tidyGetBody(tdoc); /* NIL if empty */
        if (tnod)
            tidy_tree( tnod, tdoc, show_images);
        rc = tidySaveBuffer( tdoc, &output );
    }

    if (rc >= 0) {
        /* Have sanitized output for user. output.bp can still be NULL at
           this point (presumably when Tidy wrote nothing at all, e.g.
           for an empty body): that is a success with no output, not an
           error worth logging */
        if (output.bp) {
            for (s=(char *)output.bp; *s; s++)
                bputc(b, *s);
        }
    } else {
        /* errbuf.bp may be NULL if Tidy failed before reporting anything */
        session_log(session,
                    "[html_secure_tidy] A severe error (%d) occurred: %s\n",
                    rc, errbuf.bp ? (char *)errbuf.bp : "");
    }

    /* Release output on failure paths as well if anything was allocated */
    if (output.bp)
        tidyBufFree( &output );

    tidyBufFree( &errbuf );
    tidyRelease( tdoc );
    return ((rc >= 0) ? T : NIL);
}
#else

/* Stub used when built without TIDY_ENABLE: no sanitising is available,
   so nothing is appended to b and the call always reports failure. */
BOOL html_secure_tidy(struct session *session, struct buffer *b,
                      BOOL show_images, char *input)
{
    /* Arguments are deliberately unused in this build */
    (void)session;
    (void)b;
    (void)show_images;
    (void)input;

    return NIL;
}

#endif
