<?php

namespace Velis\Filter;

use DOMDocument;
use DOMElement;
use Laminas\Filter\FilterInterface;
use RuntimeException;
use Velis\Output;

/**
 * Utility class providing an HTML sanitizer.
 *
 * The input markup is parsed into a DOMDocument and re-serialised from scratch:
 * only whitelisted elements, attributes and inline-style tokens are written back,
 * everything else is dropped (optionally leaving an explanatory HTML comment).
 */
class Washtml implements FilterInterface
{
    /* Allowed HTML elements (default) */
    public static $htmlElements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'basefont', 'bdo', 'big', 'blockquote', 'br', 'caption', 'center',
        'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
        'dt', 'em', 'fieldset', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
        'ins', 'label', 'legend', 'li', 'map', 'menu', 'nobr', 'ol', 'p', 'pre', 'q',
        's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'wbr', 'img',
        // form elements
        'button', 'input', 'textarea', 'select', 'option', 'optgroup'
    ];

    /* Ignore these HTML tags and their content */
    public static $ignoreElements = ['script', 'applet', 'embed', 'object', 'style'];


    /* Allowed HTML attributes */
    public static $htmlAttribs = ['name', 'class', 'title', 'alt', 'width', 'height',
        'align', 'nowrap', 'col', 'row', 'id', 'rowspan', 'colspan', 'cellspacing',
        'cellpadding', 'valign', 'bgcolor', 'color', 'border', 'bordercolorlight',
        'bordercolordark', 'face', 'marginwidth', 'marginheight', 'axis', 'border',
        'abbr', 'char', 'charoff', 'clear', 'compact', 'coords', 'vspace', 'hspace',
        'cellborder', 'size', 'lang', 'dir', 'usemap', 'shape', 'media',
        // attributes of form elements
        'type', 'rows', 'cols', 'disabled', 'readonly', 'checked', 'multiple', 'value'
    ];

    /* Elements which could be empty and be returned in short form (<tag />) */
    public static $voidElements = ['area', 'base', 'br', 'col', 'command', 'embed', 'hr',
        'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
    ];


    /* Set to true by filter() when a remote resource was blocked (allow_remote is off) */
    public $extlinks = false;

    /* Current settings */
    private $_config;

    /* Registered callback functions for tags */
    private $_handlers = [];

    /* Allowed HTML elements (names as array keys) */
    private $_htmlElements;

    /* Ignore these HTML tags and their content (names as array keys) */
    private $_ignoreElements;

    /* Elements which could be empty and be returned in short form (<tag />) */
    private $_voidElements;

    /* Allowed HTML attributes (names as array keys) */
    private $_htmlAttribs;

    /* Max nesting level (0 disables the guard) */
    private $_maxNestingLevel;

    /* Whether the "maximum nesting level" exception has already been thrown */
    private $_maxNestingLevel_error = false;


    /**
     * @param array $p Settings. The keys 'htmlElements', 'htmlAttribs', 'ignoreElements'
     *                 and 'voidElements' extend the static default lists; every other key
     *                 is stored as configuration ('show_washed', 'allow_remote', 'cid_map',
     *                 'ommit_dump_comments', 'blocked_src', ...).
     */
    public function __construct($p = array())
    {
        // Lists are flipped so that membership checks are simple isset() lookups.
        // "?? []" keeps a missing key from raising "Undefined array key".
        $this->_htmlElements   = array_flip((array) ($p['htmlElements'] ?? [])) + array_flip(self::$htmlElements);
        $this->_htmlAttribs    = array_flip((array) ($p['htmlAttribs'] ?? [])) + array_flip(self::$htmlAttribs);
        $this->_ignoreElements = array_flip((array) ($p['ignoreElements'] ?? [])) + array_flip(self::$ignoreElements);
        $this->_voidElements   = array_flip((array) ($p['voidElements'] ?? [])) + array_flip(self::$voidElements);

        unset($p['htmlElements'], $p['htmlAttribs'], $p['ignoreElements'], $p['voidElements']);

        // Caller-supplied values win over the defaults (array union keeps the left operand)
        $this->_config = $p + [
            'show_washed'         => true,
            'allow_remote'        => false,
            'cid_map'             => [],
            'ommit_dump_comments' => false,
        ];
    }


    /**
     * Register a callback function for a certain tag.
     *
     * The callback is invoked as callback($tagName, $washedAttribs, $washedContent, $washtml)
     * and its return value is appended to the output in place of the element.
     *
     * @param string $tagName
     * @param callable $callback
     */
    public function addCallback($tagName, $callback)
    {
        $this->_handlers[$tagName] = $callback;
    }


    /**
     * Translate a URL through the configured 'cid_map'.
     *
     * The URL is looked up as-is first and then prefixed with the current base URL.
     *
     * @param string $url
     * @return mixed|null the mapped value, or null when there is no (non-empty) mapping
     */
    private function _mapCid($url)
    {
        $map  = $this->_config['cid_map'];
        $base = isset($this->_config['base_url']) ? $this->_config['base_url'] : '';

        foreach ([$url, $base . $url] as $key) {
            if (!empty($map[$key])) {
                return $map[$key];
            }
        }

        return null;
    }


    /**
     * Check CSS style
     *
     * Splits the style on ';' and rebuilds every declaration from whitelisted tokens only
     * (url(), rgb(), numbers with units, hex colours, plain words). Tokenising stops at the
     * first unrecognised token of a declaration.
     *
     * @param string $style
     * @return string the washed style, empty when nothing survived
     */
    private function _washStyle($style)
    {
        $tokenPattern = '/^(url\(\s*[\'"]?([^\'"\)]*)[\'"]?\s*\)' . /*1,2*/
            '|rgb\(\s*[0-9]+\s*,\s*[0-9]+\s*,\s*[0-9]+\s*\)' .
            '|-?[0-9.]+\s*(em|ex|px|cm|mm|in|pt|pc|deg|rad|grad|ms|s|hz|khz|%)?' .
            '|#[0-9a-f]{3,6}' .
            '|[a-z0-9", -]+' .
            ')\s*/i';

        $s = '';

        foreach (explode(';', $style) as $declaration) {
            if (!preg_match('/^\s*([a-z\-]+)\s*:\s*(.*)\s*$/i', $declaration, $match)) {
                continue;
            }

            $cssid = $match[1];
            $str   = $match[2];
            $value = '';

            while (strlen($str) > 0 && preg_match($tokenPattern, $str, $match)) {
                // Only the url() alternative can produce a token starting with "url(" because
                // no other alternative accepts "(". Detecting it this way (instead of testing
                // the captured URL for truthiness) keeps url(), url('0') or url('") from being
                // copied verbatim - with their raw quotes - by the generic branch below.
                if (strncasecmp($match[0], 'url(', 4) === 0) {
                    $url = isset($match[2]) ? $match[2] : '';

                    if ($url === '') {
                        // empty url() carries no resource; drop it
                    } elseif ($src = $this->_mapCid($url)) {
                        $value .= ' url(' . htmlspecialchars($src, ENT_QUOTES) . ')';
                    } elseif (preg_match('!^(https?:)?//[a-z0-9/._+-]+$!i', $url, $remote)) {
                        if ($this->_config['allow_remote']) {
                            $value .= ' url(' . htmlspecialchars($remote[0], ENT_QUOTES) . ')';
                        } else {
                            $this->extlinks = true;
                        }
                    } elseif (preg_match('/^data:.+/i', $url)) { // RFC2397
                        $value .= ' url(' . htmlspecialchars($url, ENT_QUOTES) . ')';
                    }
                } else {
                    // whitelist ?
                    $value .= ' ' . $match[0];

                    // #1488535: Fix size units, so width:800 would be changed to width:800px
                    if (
                        preg_match('/(left|right|top|bottom|width|height)/i', $cssid)
                        && preg_match('/^[0-9]+$/', $match[0])
                    ) {
                        $value .= 'px';
                    }
                }

                // consume the token (including its trailing whitespace)
                $str = substr($str, strlen($match[0]));
            }

            if (isset($value[0])) {
                $s .= ($s ? ' ' : '') . $cssid . ':' . $value . ';';
            }
        }

        return $s;
    }


    /**
     * Take a node and return allowed attributes and check values
     *
     * @param DOMElement $node
     * @return string serialised attributes, each prefixed with a space; names of dropped
     *                attributes are reported in an extra x-washed attribute when the
     *                'show_washed' setting is enabled
     */
    private function _washAttribs($node)
    {
        $t = '';
        $washed = '';

        foreach ($node->attributes as $key => $plop) {
            $key   = strtolower($key);
            $value = $node->getAttribute($key);

            if (
                isset($this->_htmlAttribs[$key]) ||
                ($key === 'href' &&
                ($value = trim($value)) &&
                    !preg_match('!^(javascript|vbscript|data:text)!i', $value) &&
                    preg_match('!^([a-z]|[\/][a-z-0-9.+-]+|:|\/\/|#).+!i', $value))
            ) {
                $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"';
            } elseif ($key === 'style' && ($style = $this->_washStyle($value))) {
                // washed styles may contain a literal double quote; switch the delimiter then
                $quot = strpos($style, '"') !== false ? "'" : '"';
                $t .= ' style=' . $quot . $style . $quot;
            } elseif ($key === 'background' || ($key === 'src' && strtolower($node->tagName) === 'img')) { //check tagName anyway
                if ($src = $this->_mapCid($value)) {
                    $t .= ' ' . $key . '="' . htmlspecialchars($src, ENT_QUOTES) . '"';
                } elseif (preg_match('/^(http|https|ftp):.+/i', $value)) {
                    if ($this->_config['allow_remote']) {
                        $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"';
                    } else {
                        $this->extlinks = true;
                        // 'blocked_src' is optional: a placeholder to show instead of the remote resource
                        if (!empty($this->_config['blocked_src'])) {
                            $t .= ' ' . $key . '="' . htmlspecialchars($this->_config['blocked_src'], ENT_QUOTES) . '"';
                        }
                    }
                } elseif (preg_match('/^data:.+/i', $value)) { // RFC2397
                    $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"';
                }
            } else {
                $washed .= ($washed ? ' ' : '') . $key;
            }
        }

        if ($washed && !empty($this->_config['show_washed'])) {
            // attribute names come from untrusted markup, so escape them like any other value
            $t .= ' x-washed="' . htmlspecialchars($washed, ENT_QUOTES) . '"';
        }

        return $t;
    }


    /**
     * The main loop that recurses on a node tree.
     * It outputs only allowed tags with allowed attributes
     * and allowed inline styles.
     *
     * @param \DOMNode $node node whose children are dumped (the document itself on the first call)
     * @param int $level current recursion depth
     * @return string
     * @throws RuntimeException the first time the configured maximum nesting level is reached
     */
    private function dumpHtml($node, $level = 0)
    {
        if (!$node->hasChildNodes()) {
            return '';
        }

        $level++;

        if ($this->_maxNestingLevel > 0 && $level == $this->_maxNestingLevel - 1) {
            // report the problem once per instance, afterwards just cut the subtree off
            if (!$this->_maxNestingLevel_error) {
                $this->_maxNestingLevel_error = true;
                throw new RuntimeException("Maximum nesting level exceeded (xdebug.max_nesting_level={$this->_maxNestingLevel})");
            }
            return (!$this->_config['ommit_dump_comments']) ? '<!-- ignored -->' : '';
        }

        $node = $node->firstChild;
        $dump = '';

        do {
            switch ($node->nodeType) {
                case XML_ELEMENT_NODE: //Check element
                    $tagName = strtolower($node->tagName);
                    if (isset($this->_handlers[$tagName]) && $callback = $this->_handlers[$tagName]) {
                        $dump .= call_user_func(
                            $callback,
                            $tagName,
                            $this->_washAttribs($node),
                            $this->dumpHtml($node, $level),
                            $this
                        );
                    } elseif (isset($this->_htmlElements[$tagName])) {
                        $content = $this->dumpHtml($node, $level);
                        $dump .= '<' . $tagName . $this->_washAttribs($node) .
                        ($content === '' && isset($this->_voidElements[$tagName]) ? ' />' : ">$content</$tagName>");
                    } elseif (isset($this->_ignoreElements[$tagName])) {
                        // element and its content are both dropped
                        if (!$this->_config['ommit_dump_comments']) {
                            $dump .= '<!-- ' . htmlspecialchars($tagName, ENT_QUOTES) . ' not allowed -->';
                        }
                    } else {
                        if (!$this->_config['ommit_dump_comments']) {
                            $dump .= '<!-- ' . htmlspecialchars($tagName, ENT_QUOTES) . ' ignored -->';
                        }
                        $dump .= $this->dumpHtml($node, $level); // ignore tags not its content
                    }
                    break;

                case XML_CDATA_SECTION_NODE:
                    // NOTE(review): CDATA content is emitted without escaping. Confirm this is
                    // only reachable for elements handled by a trusted callback - TODO verify.
                    $dump .= $node->nodeValue;
                    break;

                case XML_TEXT_NODE:
                    $dump .= htmlspecialchars($node->nodeValue);
                    break;

                case XML_HTML_DOCUMENT_NODE:
                    $dump .= $this->dumpHtml($node, $level);
                    break;

                case XML_DOCUMENT_TYPE_NODE:
                    break;

                default:
                    if (!$this->_config['ommit_dump_comments']) {
                        $dump .= '<!-- node type ' . $node->nodeType . ' -->';
                    }
            }
        } while ($node = $node->nextSibling);

        return $dump;
    }


    /**
     * Main function, give it untrusted HTML and get the washed markup back.
     *
     * Remote resources are only kept when the 'allow_remote' setting is on and URLs are
     * translated through the 'cid_map' setting; $extlinks tells afterwards whether a remote
     * resource was blocked.
     *
     * @param string $html
     * @return string
     * @throws RuntimeException on PCRE failures or when the maximum nesting level is reached
     */
    public function filter($html)
    {
        if (!mb_detect_encoding($html)) {
            // Encoding could not be detected for the whole text: convert only the words
            // that are not valid in the internal encoding.
            $wordsConv = [];

            foreach (explode(' ', $html) as $word) {
                $wordsConv[] = mb_check_encoding($word) ? $word : @mb_convert_encoding($word, 'utf-8');
            }
            $html = implode(' ', $wordsConv);
        }

        // Charset seems to be ignored (probably if defined in the HTML document).
        // mb_detect_encoding() may still return false; fall back to UTF-8 instead of
        // passing false on as an encoding name.
        $node = new DOMDocument('1.0', mb_detect_encoding($html) ?: 'UTF-8');
        $this->extlinks = false;

        $html = Output::xssEntityDecode($html);
        $html = @mb_convert_encoding($html, 'html-entities', mb_detect_encoding($html) ?: 'UTF-8');
        $html = $this->cleanup($html);

        // Find base URL for images
        if (preg_match('/<base\s+href=[\'"]*([^\'"]+)/is', $html, $matches)) {
            $this->_config['base_url'] = $matches[1];
        } else {
            $this->_config['base_url'] = '';
        }

        // Detect max nesting level (for dumpHTML) (#1489110).
        // ini_get() returns false when xdebug is not loaded, which yields 0 (= guard disabled).
        $this->_maxNestingLevel = (int) ini_get('xdebug.max_nesting_level');

        // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8, which "@"
        // does not silence); an empty document washes to an empty string anyway.
        if ($html === '') {
            return '';
        }

        @$node->loadHTML($html);

        return $this->dumpHtml($node);
    }


    /**
     * Invoke magic function, an alias of filter()
     *
     * @param string $html
     * @return string
     */
    public function __invoke($html)
    {
        return $this->filter($html);
    }


    /**
     * Getter for config parameters
     * @param $prop
     * @return mixed the configured value, or null when the parameter is not set
     */
    public function getConfig($prop)
    {
        return $this->_config[$prop] ?? null;
    }


    /**
     * Clean HTML input before it is handed to the DOM parser
     *
     * @param string $html
     * @return string
     * @throws RuntimeException when one of the regular expressions fails
     */
    private function cleanup($html)
    {
        // special replacements (not properly handled by washtml class)
        $html_search = [
            '/(<\/nobr>)(\s+)(<nobr>)/i',       // space(s) between <NOBR>
            '/<title[^>]*>[^<]*<\/title>/i',    // PHP bug #32547 workaround: remove title tag
            '/^(\0\0\xFE\xFF|\xFF\xFE\0\0|\xFE\xFF|\xFF\xFE|\xEF\xBB\xBF)/',    // byte-order mark (only outlook?)
            '/<html\s[^>]+>/i',                 // washtml/DOMDocument cannot handle xml namespaces
        ];

        $html_replace = [
            '\\1' . ' &nbsp; ' . '\\3',
            '',
            '',
            '<html>',
        ];

        // PCRE errors handling (#1486856): every preg_* result is checked below
        $html = $this->_checkPcreResult(preg_replace($html_search, $html_replace, trim($html)));

        // fix (unknown/malformed) HTML tags before "wash"
        $html = $this->_checkPcreResult(
            preg_replace_callback('/(<[\/]*)([^\s>]+)/', [self::class, 'htmlTagCallback'], $html)
        );

        // Remove invalid HTML comments (#1487759)
        // Don't remove valid conditional comments
        // Don't remove MSOutlook (<!-->) conditional comments (#1489004)
        return $this->_checkPcreResult(preg_replace('/<!--[^->\[\n]+>/', '', $html));
    }


    /**
     * Pass a preg_replace*() result through, turning a PCRE failure into an exception
     *
     * @param string|null $result
     * @return string
     * @throws RuntimeException when $result is null and PCRE reports an error
     */
    private function _checkPcreResult($result)
    {
        if ($result === null && ($preg_error = preg_last_error()) != PREG_NO_ERROR) {
            $errstr = "Could not clean up HTML message! PCRE Error: $preg_error.";

            if ($preg_error == PREG_BACKTRACK_LIMIT_ERROR) {
                $errstr .= " Consider raising pcre.backtrack_limit!";
            }
            if ($preg_error == PREG_RECURSION_LIMIT_ERROR) {
                $errstr .= " Consider raising pcre.recursion_limit!";
            }

            throw new RuntimeException($errstr);
        }

        return (string) $result;
    }


    /**
     * Callback function for HTML tags fixing
     *
     * Strips everything from the first colon on and any character that is not allowed
     * in a tag name.
     *
     * @param string[] $matches [1] is the tag opener ("<" or "</"), [2] the raw tag name
     * @return string
     */
    public static function htmlTagCallback($matches)
    {
        $tagname = preg_replace([
            '/:.*$/',               // Microsoft's Smart Tags <st1:xxxx>
            '/[^a-z0-9_\[\]\!-]/i', // forbidden characters
        ], '', $matches[2]);

        return $matches[1] . $tagname;
    }
}
