// html.ts (14.4 KB)
import nodeUtil from 'util'
import TTLCache from '@isaacs/ttlcache'
import { decode } from 'html-entities'

import type { NodeType } from 'ultrahtml'
import {
  parse as ultraParse,
  ELEMENT_NODE,
  TEXT_NODE,
  walkSync,
} from 'ultrahtml'

import type { AST } from 'parsel-js'
import {
  parse as elParse,
  specificity as getSpecificity,
  specificityToNumber,
} from 'parsel-js'

import { parsedHtmlCache, selectorCache, findNodeCache } from './cache.js'

/**
 * Optionally renames the `class` attribute to `className`.
 *
 * @param attributes - attribute map to convert; never mutated
 * @param options - `mapClassname` enables the rename (it is `true` only when
 *   `options` is omitted entirely)
 * @returns `attributes` itself when the rename is disabled; otherwise a shallow
 *   copy without `class` and with a `className` key (whose value is `undefined`
 *   when the input had no `class`)
 */
export const fixAttributes = (attributes, options = { mapClassname: true }) => {
  if (!options.mapClassname) return attributes
  const { class: className, ...others } = attributes
  const mapped = { ...others }
  mapped.className = className
  return mapped
}

/**
 * Returns a copy of `node` whose `class` attribute has `extraClasses` appended.
 *
 * @param node - node-like object; `attributes` may be missing
 * @param extraClasses - space-separated class names to add; when falsy the
 *   original `node` is returned untouched
 * @returns the same `node` when there is nothing to add, otherwise a shallow
 *   copy with a new `attributes` object (the input is never mutated)
 */
export const appendClasses = (node, extraClasses) => {
  // Nothing to append: previously an existing class became "<class> undefined".
  if (!extraClasses) return node
  const { class: className, ...attributes } = node.attributes ?? {}
  const merged = className ? `${className} ${extraClasses}` : `${extraClasses}`
  return { ...node, attributes: { ...attributes, class: merged } }
}

// TODO: implement a parent/child/element cache
/**
 * Position of `node` within `filterChildElements(parent)`, or -1 when it is not
 * in that list.
 * NOTE(review): `filterChildElements` is not defined in the visible part of
 * this file — confirm it exists before relying on this helper.
 */
const nthChildPos = (node, parent) => {
  const siblings = filterChildElements(parent)
  return siblings.findIndex((candidate) => candidate === node)
}

/**
 * Matcher-shaped predicate that accepts element nodes only.
 * `context`, `parent` and `i` are part of the matcher signature but unused here.
 */
const filterChildElementsMatcher = (context, child, parent, i) => {
  return child.type === ELEMENT_NODE
}

/**
 * Internal matcher signature: per-traversal state, the node under test, its
 * parent, the node's index in `parent.children`, and a debug-logging flag.
 * `parent`, `i` and `debug` are optional (a required parameter may not follow
 * an optional one, and several call sites in this file omit `debug`).
 */
type Matcher = (context, node: NodeType, parent?: NodeType, i?: number, debug?: boolean) => boolean
/** Creates a matcher together with fresh state for one traversal. */
type MatcherProducer = () => Matcher
/** Predicate over a node's attribute value (`undefined` when the attribute is absent). */
type AttrValueMatch = (attrValue: string | undefined) => boolean

/**
 * Compiles a parsel-js selector AST into a producer of stateful node matchers.
 *
 * Compilation happens once per selector. Every call of the returned producer
 * yields a matcher with fresh state (child-list caches and the "seen" sets used
 * by combinators). Matchers expect to be fed nodes parents-first, in document
 * order, because combinators consult state recorded for earlier nodes.
 *
 * NOTE(review): class selectors also look at `attributes.className`, because
 * nodes proxied with the `react` option expose `class` under that name; other
 * attribute selectors see whatever names the node exposes.
 *
 * @param ast - parsed selector
 * @param selector - original selector text, used for diagnostics and `toString()`
 * @returns factory producing `(node, parent, i, debug) => boolean`
 * @throws Error for unhandled AST node types, unknown pseudo-classes and
 *   unparseable `:nth-child()` arguments
 */
const compileMatcher = (ast: AST, selector: string): MatcherProducer => {
  let counter = 0

  // One factory per state slot; slots are instantiated by the returned producer.
  const neededContext = []
  const selectorCacheCounter = counter++
  neededContext[ selectorCacheCounter ] = () => new WeakMap()

  /** Children of `parent` accepted by `matcher`, cached per parent under `key`. */
  const findChildren = (context, parent: NodeType | null | undefined, key: string, matcher: Matcher): NodeType[] => {
    // `== null` also covers the root node, whose parent is undefined (not a valid WeakMap key).
    if (parent == null) return []
    const cacheByParent = context[ selectorCacheCounter ]
    let cacheByKey = cacheByParent.get(parent)
    if (!cacheByKey) cacheByParent.set(parent, cacheByKey = new Map())
    const cachedResult = cacheByKey.get(key)
    if (cachedResult) return cachedResult
    const newResult = parent.children?.filter((child, index) => matcher(context, child, parent, index)) ?? []
    cacheByKey.set(key, newResult)
    return newResult
  }

  /** Element children of `parent` (cached). */
  const elementChildren = (context, parent) => findChildren(context, parent, 'ELEMENT', filterChildElementsMatcher)

  /**
   * Builds a matcher for the An+B micro-syntax; the matcher's `i` argument is
   * the 1-based position of the node among its counted siblings.
   */
  const makeNthChildPosMatcher = (argument: string): Matcher => {
    const text = (argument ?? '').trim().toLowerCase()
    if (!text) throw new Error(`Unsupported empty nth-child selector!`)
    if (text === 'odd') return (context, node, parent, i) => i % 2 === 1
    if (text === 'even') return (context, node, parent, i) => i % 2 === 0

    let a = 0
    let b = 0
    if (/^[+-]?\d+$/.test(text)) {
      // Simple variant, just a number
      b = Number.parseInt(text, 10)
    } else {
      const parsed = /^([+-]?\d*)n(?:\s*([+-])\s*(\d+))?$/.exec(text)
      if (!parsed) throw new Error(`Unsupported nth-child selector: ${argument}`)
      const [ , A, sign, B ] = parsed
      a = A === '' || A === '+' ? 1 : A === '-' ? -1 : Number.parseInt(A, 10)
      b = B === undefined ? 0 : Number.parseInt(B, 10) * (sign === '-' ? -1 : 1)
    }

    if (a === 0) return (context, node, parent, i) => i === b
    // i matches when i = a*n + b for some integer n >= 0.
    return (context, node, parent, i) => {
      const n = (i - b) / a
      return n >= 0 && Number.isInteger(n)
    }
  }

  /**
   * Builds the predicate for an attribute selector.
   * `value` is the raw selector value (possibly quoted); `caseSensitive` is the
   * optional flag from the selector (`i` requests case-insensitive comparison).
   */
  const getAttrValueMatch = (value: string, operator: string = '=', caseSensitive?: string): AttrValueMatch => {
    // `[attr]`: presence test only
    if (value === undefined) return (attrValue) => attrValue !== undefined
    if (value[ 0 ] === '"' || value[ 0 ] === '\'') value = value.substring(1, value.length - 1)
    const isCaseInsensitive = caseSensitive === 'i' || caseSensitive === 'I'
    const normalize = (text: string) => isCaseInsensitive ? text.toLowerCase() : text
    const expected = normalize(value)
    // An absent attribute never matches a value test; the flag applies to every operator.
    const whenPresent = (test: (actual: string) => boolean): AttrValueMatch =>
      (attrValue) => typeof attrValue === 'string' && test(normalize(attrValue))
    const never: AttrValueMatch = () => false
    switch (operator) {
      case '=': return whenPresent((actual) => actual === expected)
      case '~=':
        // An empty value or one containing whitespace can never equal a single word.
        if (expected === '' || /\s/.test(expected)) return never
        return whenPresent((actual) => actual.split(/\s+/).includes(expected))
      case '|=': return whenPresent((actual) => actual === expected || actual.startsWith(expected + '-'))
      // Substring operators with an empty value match nothing.
      case '*=': return expected === '' ? never : whenPresent((actual) => actual.includes(expected))
      case '$=': return expected === '' ? never : whenPresent((actual) => actual.endsWith(expected))
      case '^=': return expected === '' ? never : whenPresent((actual) => actual.startsWith(expected))
    }
    return never
  }

  const makeMatcher = (ast: AST) => {
    switch (ast.type) {
      case 'list': {
        // Comma-separated alternatives: the node matches when ANY of them does.
        const matchers = ast.list.map((s) => makeMatcher(s))
        return (context, node, parent, i, debug) => {
          // No short-circuit: every alternative must see every node to keep its combinator state.
          let matched = false
          for (const matcher of matchers) {
            if (matcher(context, node, parent, i, debug)) matched = true
          }
          return matched
        }
      }
      case 'compound': {
        // All simple selectors of the compound must match.
        const matchers = ast.list.map((s) => makeMatcher(s))
        return (context, node, parent, i, debug) => {
          // No short-circuit, for the same reason as in 'list' (e.g. `div:is(a b)`).
          let matched = true
          for (const matcher of matchers) {
            if (!matcher(context, node, parent, i, debug)) matched = false
          }
          return matched
        }
      }
      case 'complex': {
        const { left, right, combinator, pos } = ast
        const leftMatcher = makeMatcher(left)
        const rightMatcher = makeMatcher(right)
        const setCounter = counter++
        neededContext[ setCounter ] = () => new WeakSet()
        return (context, node, parent, i, debug) => {
          // `seen` holds nodes matching the left side; for the descendant
          // combinator the mark is also propagated to every descendant.
          const seen = context[ setCounter ]
          if (leftMatcher(context, node, parent, i, debug)) {
            if (debug) console.log('matched on left', { left, right, combinator, pos, parent })
            seen.add(node)
          } else if (combinator === ' ' && parent && seen.has(parent)) {
            seen.add(node)
          }
          // Right-hand matches are deliberately NOT added to `seen`: doing so made
          // e.g. `div > span` match a span nested directly inside another span.
          if (!rightMatcher(context, node, parent, i, debug)) return false
          if (debug) console.log('matched on right', { left, right, combinator, pos, node, parent })
          switch (combinator) {
            case ' ': {
              // Parents handed in by the traversal carry the propagated mark;
              // the `.parent` chain is kept as a fallback.
              for (let ancestor = parent; ancestor; ancestor = ancestor.parent) {
                if (seen.has(ancestor)) return true
              }
              return false
            }
            case '>':
              if (debug) console.log('seen parent', seen.has(parent))
              return parent ? seen.has(parent) : false
            case '+': {
              if (!parent) return false
              // Closest preceding element sibling
              const siblings = parent.children ?? []
              for (let k = i - 1; k >= 0; k--) {
                if (siblings[ k ].type === ELEMENT_NODE) return seen.has(siblings[ k ])
              }
              return false
            }
            case '~': {
              if (!parent) return false
              // Any preceding element sibling
              return (parent.children ?? []).slice(0, i).some((el) => el.type === ELEMENT_NODE && seen.has(el))
            }
            default:
              return false
          }
        }
      }
      case 'type': {
        const { name, content } = ast
        if (content === '*') return () => true
        return (context, node) => node.name === name
      }
      case 'class': {
        const { name } = ast
        return (context, node) => {
          const classValue = node.attributes?.class ?? node.attributes?.className
          return typeof classValue === 'string' && classValue.split(/\s+/).includes(name)
        }
      }
      case 'id': {
        const { name } = ast
        return (context, node) => node.attributes?.id === name
      }
      case 'pseudo-class':
        switch (ast.name) {
          case 'global':
            return makeMatcher(elParse(ast.argument))
          case 'not': {
            const matcher = makeMatcher(ast.subtree)
            return (...args) => !matcher(...args)
          }
          case 'is':
          case 'where':
            return makeMatcher(ast.subtree)
          case 'root':
            return (context, node) => !node.parent
          case 'empty':
            return (context, node) => {
              if (node.type !== ELEMENT_NODE) return false
              // No children at all counts as empty; whitespace-only text is tolerated.
              return (node.children ?? []).every((child) => child.type === TEXT_NODE && child.value.trim() === '')
            }
          case 'first-child':
            return (context, node, parent) => elementChildren(context, parent)[ 0 ] === node
          case 'last-child':
            return (context, node, parent) => {
              const elements = elementChildren(context, parent)
              return elements.length > 0 && elements[ elements.length - 1 ] === node
            }
          case 'only-child':
            return (context, node, parent) => {
              const elements = elementChildren(context, parent)
              return elements.length === 1 && elements[ 0 ] === node
            }
          // case 'nth-of-type':
          // case 'nth-last-of-type':
          case 'nth-child':
          case 'nth-last-child': {
            const fromEnd = ast.name === 'nth-last-child'
            // With an "of S" clause only siblings matching S are counted.
            const argument = ast.subtree ? ast.argument.replace(/\s*of\s+.*$/, '') : ast.argument
            const nthChildMatcher = makeNthChildPosMatcher(argument)
            const childSelector = ast.subtree ? ast.content : 'ELEMENT'
            const childMatcher = ast.subtree ? makeMatcher(ast.subtree) : filterChildElementsMatcher
            return (context, node, parent, i, debug) => {
              const children = findChildren(context, parent, childSelector, childMatcher)
              const pos = children.indexOf(node)
              if (pos === -1) return false
              // 1-based position, counted from the requested end
              const position = fromEnd ? children.length - pos : pos + 1
              return nthChildMatcher(context, node, parent, position, debug)
            }
          }
          default:
            console.error('pseudo-class', nodeUtil.inspect({ selector, ast }, { depth: null, colors: true }))
            throw new Error(`Unknown pseudo-class: ${ast.name}`)
        }
      case 'attribute': {
        const { caseSensitive, name, value, operator } = ast
        const attrValueMatch = getAttrValueMatch(value, operator, caseSensitive)
        return (context, node) => attrValueMatch(node.attributes?.[ name ])
      }
      case 'universal':
        return () => true
      default:
        throw new Error(`Unhandled ast: ${ast.type}`)
    }
  }

  const matcher = makeMatcher(ast)
  return () => {
    // Fresh state for this traversal
    const context = neededContext.map((create) => create())
    const nodeMatcher = (node, parent, i, debug) => matcher(context, node, parent, i, debug)
    nodeMatcher.toString = () => '[matcher:' + selector + ']'
    return nodeMatcher
  }
}

/**
 * Returns a fresh matcher for `selector`.
 *
 * The compiled producer is stored in `selectorCache` under the selector text,
 * so parsing and compilation happen only when the cache has no entry; the
 * producer itself is invoked on every call.
 */
export const createMatcher = (selector: string): Matcher => {
  let producer = selectorCache.get(selector)
  if (!producer) {
    producer = compileMatcher(elParse(selector), selector)
    selectorCache.set(selector, producer)
  }
  return producer()
}

/**
 * Attribute names as written in markup, mapped to the names exposed instead
 * when a node is proxied with the `react` option.
 */
const reactAttributeMap = {
  class: 'className',
  srcset: 'srcSet',
  maxlength: 'maxLength',
}

/**
 * Proxy `get` trap for parsed nodes.
 *
 * - `attributes`: values are passed through `decode`; with `options.react` the
 *   names listed in `reactAttributeMap` are renamed.
 * - `parent` / `children`: wrapped with `proxyNode` using the same options.
 * - functions: wrapped so they run with the raw target as `this`.
 *
 * Derived values are computed once per handler and cached by property key.
 * Falsy properties and other plain values are returned as-is.
 */
class NodeProxyHandler {
  #options
  #cache
  constructor(options: { react?: boolean } = {}) {
    this.#options = options
    // A Map (rather than `{}` + `in`) cannot report inherited names such as
    // `constructor` or `toString` as cached.
    this.#cache = new Map()
  }

  get(target, prop, receiver) {
    if (this.#cache.has(prop)) return this.#cache.get(prop)
    const origValue = target[ prop ]
    if (!origValue) return origValue
    const remember = (value) => {
      this.#cache.set(prop, value)
      return value
    }
    switch (prop) {
      case 'attributes':
        return remember(Object.fromEntries(Object.entries(origValue).map(([ key, value ]) => {
          const decoded = decode(value)
          // Own-property check so attribute names like "constructor" are not remapped.
          if (this.#options.react && Object.prototype.hasOwnProperty.call(reactAttributeMap, key)) {
            return [ reactAttributeMap[ key ], decoded ]
          }
          return [ key, decoded ]
        })))
      case 'parent':
        return remember(proxyNode(origValue, this.#options))
      case 'children':
        return remember(origValue.map((child) => proxyNode(child, this.#options)))
      default:
        if (typeof origValue === 'function') {
          // `apply` takes the argument list as ONE array, and the result must be returned.
          return remember((...args) => origValue.apply(target, args))
        }
        return origValue
    }
  }
}

/** Wraps `node` in a Proxy driven by a new `NodeProxyHandler` built from `options`. */
const proxyNode = (node, options) => {
  const handler = new NodeProxyHandler(options)
  return new Proxy(node, handler)
}

/**
 * Parses `html` and returns the document wrapped by `proxyNode`.
 *
 * The raw parse result is stored in `parsedHtmlCache` under the HTML string,
 * so `ultraParse` runs only when the cache has no entry; a new proxy is
 * created on every call with the given `options`.
 */
export const parseHtml = (html: string, options): NodeType => {
  let doc = parsedHtmlCache.get(html)
  if (!doc) {
    doc = ultraParse(html)
    parsedHtmlCache.set(html, doc)
  }
  return proxyNode(doc, options)
}

/**
 * Finds the first node under `doc` (in `walkSync` order) that matches `selector`.
 *
 * Results are cached per document and selector, wrapped in a one-element array
 * so that "no match" can be cached too.
 *
 * @param doc - root node to search
 * @param selector - CSS selector; when empty, `doc` itself is returned
 * @returns the first matching node, or `undefined` when nothing matches
 * @throws whatever the matcher or the traversal throws
 */
export const findNode = (doc: NodeType, selector: string): NodeType => {
  if (!selector) return doc
  let docCache = findNodeCache.get(doc)
  if (!docCache) {
    // TTLCache's `ttl` is in milliseconds: ten minutes (the previous `10*60` was 0.6 s).
    docCache = new TTLCache({ ttl: 10 * 60 * 1000 })
    findNodeCache.set(doc, docCache)
  }
  const cached = docCache.get(selector)
  if (cached !== undefined) return cached[ 0 ]
  const matcher = createMatcher(selector)
  // walkSync offers no early exit, so a private sentinel is thrown to stop the
  // traversal; anything else that is thrown is a real failure and propagates.
  const stop = {}
  let match
  try {
    walkSync(doc, (node, parent, index) => {
      if (matcher(node, parent, index)) {
        match = node
        throw stop
      }
    })
  } catch (e) {
    if (e !== stop) throw e
  }
  docCache.set(selector, [ match ])
  return match
}