// html.ts (10.8 KB)
import nodeUtil from 'util'
import NodeCache from 'node-cache'

import type { NodeType } from 'ultrahtml'
import {
  parse as ultraParse,
  ELEMENT_NODE,
  TEXT_NODE,
  walkSync,
} from 'ultrahtml'

import type { AST } from 'parsel-js'
import {
  parse as elParse,
  specificity as getSpecificity,
  specificityToNumber,
} from 'parsel-js'

import { parsedHtmlCache, selectorCache, findNodeCache } from './cache.js'

/**
 * Renames the `class` attribute to `className`.
 *
 * When `options.mapClassname` is falsy the input object is returned untouched.
 * Otherwise a shallow copy is returned in which `class` is removed and
 * `className` carries its value (`undefined` when there was no `class`).
 */
export const fixAttributes = (attributes, options = { mapClassname: true }) => {
  if (!options.mapClassname) return attributes
  const renamed = { ...attributes, className: attributes.class }
  delete renamed.class
  return renamed
}

/**
 * Returns a copy of `node` whose `class` attribute has `extraClasses` appended.
 *
 * - When `extraClasses` is empty/undefined there is nothing to append and the
 *   original `node` is returned as-is (previously an existing class became
 *   the literal text `"<class> undefined"`).
 * - A node without an `attributes` object is treated as having no attributes.
 *
 * The input node and its attributes object are never mutated.
 *
 * @param node - node-like object with an optional `attributes` record
 * @param extraClasses - space-separated class names to append
 * @returns `node` itself when nothing changes, otherwise a shallow copy
 */
export const appendClasses = (node, extraClasses) => {
  if (!extraClasses) return node
  const { attributes: { class: className, ...attributes } = {} } = node
  const combined = className ? `${className} ${extraClasses}` : `${extraClasses}`
  return { ...node, attributes: { ...attributes, class: combined } }
}

// TODO: implement a parent/child/element cache

// Element-type children of `parent`; an empty list when `parent` or its
// `children` is missing.
const filterChildElements = (parent) => {
  const elements = parent?.children?.filter((child) => child.type === ELEMENT_NODE)
  return elements || []
}

// Zero-based index of `node` among the element children of `parent`, or -1
// when `node` is not one of them.
const nthChildPos = (node, parent) => filterChildElements(parent).indexOf(node)

/**
 * Predicate produced for one selector AST node.
 *
 * - `context`: per-run state, one WeakSet per `complex` selector
 * - `node` / `parent`: the node under test and its parent (absent at the root)
 * - `i`: position of `node` (meaning depends on the caller: child index, or
 *   1-based element position for the nth-child helpers)
 * - `debug`: optional flag enabling console tracing
 */
type Matcher = (context: WeakSet<object>[], node: unknown, parent: unknown, i: number, debug?: boolean) => boolean

/**
 * Tests the value of a single attribute; `undefined` means the attribute is
 * absent from the node.
 */
type AttrValueMatch = (attrValue: string | undefined) => boolean

// Parses an `:nth-child()` argument into the coefficients of `a*n + b`.
// Accepts `odd`, `even`, a plain integer, and the An+B micro-syntax
// (`2n+1`, `-n + 3`, `n`, `3n - 2`, ...); throws on anything else.
const parseNthChildArgument = (argument: unknown): [number, number] => {
  const expression = typeof argument === 'string' ? argument.trim().toLowerCase() : ''
  if (!expression) throw new Error(`Unsupported empty nth-child selector!`)
  if (expression === 'odd') return [2, 1]
  if (expression === 'even') return [2, 0]
  const parsed = /^(?:([+-]?\d*)n\s*(?:([+-])\s*(\d+))?|([+-]?\d+))$/.exec(expression)
  if (!parsed) throw new Error(`Unsupported nth-child selector: ${argument}`)
  const [, step, sign, offset, index] = parsed
  // Plain integer: a fixed position, i.e. 0*n + index.
  if (index !== undefined) return [0, Number.parseInt(index, 10)]
  // A bare `n`, `+n` or `-n` has an implicit step of 1 / -1.
  const a = step === '' || step === '+' ? 1 : step === '-' ? -1 : Number.parseInt(step, 10)
  const b = offset === undefined ? 0 : Number.parseInt(offset, 10) * (sign === '-' ? -1 : 1)
  return [a, b]
}

/**
 * Builds the position test for an `:nth-child(...)` pseudo-class.
 *
 * The returned matcher only looks at `i`, which must be the 1-based position
 * of the node among its parent's element children; a position below 1 (the
 * node is not an element child) never matches.
 *
 * @param ast - pseudo-class AST node; only its `argument` text is read
 * @returns matcher that is true when `i === a*n + b` for some integer `n >= 0`
 * @throws Error when the argument is empty or not `odd`, `even`, an integer
 *   or an An+B expression (the `of <selector>` form is not supported)
 */
const makeNthChildPosMatcher = (ast: AST): Matcher => {
  const [a, b] = parseNthChildArgument(ast.argument)
  return (context, node, parent, i) => {
    if (i < 1) return false
    if (a === 0) return i === b
    // Solve i = a*n + b for n; it matches when n is a non-negative integer.
    const n = (i - b) / a
    return n >= 0 && Number.isInteger(n)
  }
}

/**
 * Builds the value test for a CSS attribute selector such as `[lang|="en" i]`.
 *
 * @param value - right-hand side of the selector; `undefined` for a bare
 *   `[attr]` presence test. One pair of matching surrounding quotes is
 *   stripped, because the selector text may be handed over still quoted.
 * @param operator - one of `=`, `~=`, `|=`, `^=`, `$=`, `*=` (default `=`);
 *   any other operator yields a test that never matches
 * @param caseSensitive - the selector's case flag; only the string `i`/`I`
 *   switches to case-insensitive comparison
 * @returns predicate over the node's attribute value; a missing attribute
 *   (`undefined`) only satisfies... nothing except being rejected — it matches
 *   no value test, and fails the presence test
 */
const getAttrValueMatch = (value: string | undefined, operator: string = '=', caseSensitive?: string | boolean): AttrValueMatch => {
  // `[attr]`: the attribute merely has to exist (an empty string counts).
  if (value === undefined) return (attrValue) => attrValue !== undefined

  const quoted = /^(["'])([\s\S]*)\1$/.exec(value)
  const literal = quoted ? quoted[2] : value

  const isCaseInsensitive = typeof caseSensitive === 'string' && caseSensitive.toLowerCase() === 'i'
  const normalize = (text: string) => (isCaseInsensitive ? text.toLowerCase() : text)
  const expected = normalize(literal)

  // Wraps a test on the (normalized) actual value so that an absent attribute
  // is rejected instead of crashing.
  const adjustMatcher = (test: (actual: string) => boolean): AttrValueMatch =>
    (attrValue) => typeof attrValue === 'string' && test(normalize(attrValue))
  const never: AttrValueMatch = () => false

  switch (operator) {
    case '=':
      return adjustMatcher((actual) => actual === expected)
    case '~=':
      // Whitespace-separated word match; an empty or multi-word needle can never be a word.
      if (expected === '' || /\s/.test(expected)) return never
      return adjustMatcher((actual) => actual.split(/\s+/).includes(expected))
    case '|=':
      // Exact value, or value followed by a hyphen (language sub-tags).
      return adjustMatcher((actual) => actual === expected || actual.startsWith(expected + '-'))
    case '^=':
      return expected === '' ? never : adjustMatcher((actual) => actual.startsWith(expected))
    case '$=':
      return expected === '' ? never : adjustMatcher((actual) => actual.endsWith(expected))
    case '*=':
      return expected === '' ? never : adjustMatcher((actual) => actual.includes(expected))
    default:
      return never
  }
}

/**
 * Compiles a parsel-js selector AST into a factory of node predicates.
 *
 * Matching is stateful: every `complex` selector (one with a combinator)
 * records, in a per-run WeakSet, the nodes its left-hand side matched and
 * resolves the combinator against that record. A predicate therefore has to
 * be fed every node of one document, with parents and preceding siblings
 * before the node itself, and must not be shared between runs — hence the
 * factory, which hands out a predicate with fresh, empty state on each call.
 *
 * @param ast - selector AST as produced by parsel-js `parse`
 * @param selector - the selector text; only used by the predicate's `toString`
 * @returns zero-argument factory yielding `(node, parent, i, debug?) => boolean`,
 *   where `i` is the index of `node` in `parent.children`
 * @throws Error for AST node types or pseudo-classes that are not supported
 */
const compileMatcher = (ast: AST, selector: string): (() => (node, parent, i, debug?) => boolean) => {
  let counter = 0

  // Slot k holds the factory for the WeakSet used by the k-th `complex` matcher.
  const neededContext = []

  // Runs EVERY sub-matcher on the node. No short-circuiting on purpose: nested
  // `complex` matchers must observe each node to keep their record complete.
  const runAll = (matchers, context, node, parent, i, debug) =>
    matchers.map((matcher) => Boolean(matcher(context, node, parent, i, debug)))

  // Element children of `parent` located before child index `i`.
  const previousElementSiblings = (parent, i) =>
    parent.children.slice(0, i).filter((el) => el.type === ELEMENT_NODE)

  const makeMatcher = (ast: AST) => {
    //console.log('makeMatcher', ast)
    switch (ast.type) {
      case 'list': {
        // A selector list (`a, b`) matches when ANY alternative matches.
        const matchers = ast.list.map(s => makeMatcher(s))
        return (context, node, parent, i, debug) =>
          runAll(matchers, context, node, parent, i, debug).includes(true)
      }
      case 'compound': {
        // A compound selector (`a.b#c`) matches when ALL of its parts match.
        const matchers = ast.list.map(s => makeMatcher(s))
        return (context, node, parent, i, debug) =>
          !runAll(matchers, context, node, parent, i, debug).includes(false)
      }
      case 'complex': {
        const { left, right, combinator, pos } = ast
        const leftMatcher = makeMatcher(left)
        const rightMatcher = makeMatcher(right)
        const setCounter = counter++
        neededContext[ setCounter ] = () => new WeakSet()
        return (context, node, parent, i, debug) => {
          // `seen` holds the nodes matched by the left side and, for the
          // descendant combinator, everything nested below such a node.
          const seen = context[ setCounter ]
          if (leftMatcher(context, node, parent, i, debug)) {
            if (debug) console.log('matched on left', { left, right, combinator, pos, parent })
            // TODO: Check seen.has(), and maybe skip calling leftMatcher?
            seen.add(node)
          } else if (combinator === ' ' && parent && seen.has(parent)) {
            seen.add(node)
          }
          if (!rightMatcher(context, node, parent, i, debug)) return false
          // Note: a right-hand match is NOT recorded in `seen`; doing so would
          // let `a > b` match a `b` nested in another `b` with no `a` around.
          if (debug) console.log('matched on right', { left, right, combinator, pos, node, parent })
          switch (combinator) {
            case ' ': {
              let parentPtr = parent
              while (parentPtr) {
                if (seen.has(parentPtr)) return true
                parentPtr = parentPtr.parent
              }
              return false
            }
            case '>':
              if (debug) console.log('seen parent', seen.has(parent))
              return parent ? seen.has(parent) : false
            case '+': {
              // Only the closest preceding element sibling counts.
              if (!parent) return false
              const prev = previousElementSiblings(parent, i).pop()
              return prev ? seen.has(prev) : false
            }
            case '~': {
              if (!parent) return false
              return previousElementSiblings(parent, i).some((prev) => seen.has(prev))
            }
            default:
              return false
          }
        }
      }
      case 'type': {
        const { name, content } = ast
        if (content === '*') return (context, node,  parent, i) => true
        return (context, node,  parent, i, debug) => node.name === name
      }
      case 'class': {
        const { name } = ast
        return (context, node,  parent, i, debug) => node.attributes?.['class']?.split(/\s+/g).includes(name)
      }
      case 'id': {
        const { name } = ast
        return (context, node,  parent, i, debug) => node.attributes?.id === name
      }
      case 'pseudo-class':
        switch (ast.name) {
          case 'global':
            return makeMatcher(elParse(ast.argument))
          case 'not': {
            const matcher = makeMatcher(ast.subtree)
            return (...args) => !matcher(...args)
          }
          case 'is':
          case 'where':
            return makeMatcher(ast.subtree)
          case 'root':
            // NOTE(review): this is true for any node without a `parent`; if the
            // parser wraps content in a document node, that wrapper (not the top
            // element) is what matches — confirm the intended semantics.
            return (context, node,  parent, i) => !node.parent
          case 'empty':
            return (context, node,  parent, i, debug) => {
              if (node.type !== ELEMENT_NODE) return false
              // No children at all, or nothing but whitespace-only text.
              return node.children.every(child => child.type === TEXT_NODE && child.value.trim() === '')
            }
          case 'first-child':
            return (context, node, parent, i, debug) => filterChildElements(parent)[0] === node
          case 'last-child':
            return (context, node, parent, i, debug) => {
              const elements = filterChildElements(parent)
              return elements[elements.length - 1] === node
            }
          case 'only-child':
            return (context, node, parent, i, debug) => {
              // TODO: This can break-early after it finds the second element
              const elements = filterChildElements(parent)
              return elements.length === 1 && elements[0] === node
            }
          case 'nth-child': {
            const nthChildMatcher = makeNthChildPosMatcher(ast)
            return (context, node, parent, i, debug) => {
              // 1-based position among element siblings; 0 when not an element child.
              const pos = nthChildPos(node, parent) + 1
              return nthChildMatcher(context, node, parent, pos, debug)
            }
          }
          default:
            throw new Error(`Unknown pseudo-class: ${ast.name}`)
        }
      case 'attribute': {
        const { caseSensitive, name, value, operator } = ast
        const attrValueMatch = getAttrValueMatch(value, operator, caseSensitive)
        return (context, node,  parent, i, debug) => {
          const { attributes: { [ name ]: attrValue } = {} } = node
          return attrValueMatch(attrValue)
        }
      }
      case 'universal':
        return (context, node,  parent, i, debug) => true
      default:
        throw new Error(`Unhandled ast: ${ast.type}`)
    }
  }
  const matcher = makeMatcher(ast)
  return () => {
    // Fresh state for this run: one empty WeakSet per `complex` matcher.
    const context = neededContext.map(item => item())
    const nodeMatcher = (node, parent, i, debug) => {
      //if (debug) console.log('starting to match', {node, context})
      return matcher(context, node, parent, i, debug)
    }
    nodeMatcher.toString = () => {
      return '[matcher:' + selector + ']'
    }
    return nodeMatcher
  }
}

/**
 * Returns a ready-to-use node predicate for `selector`.
 *
 * The selector is parsed and compiled once; the resulting factory is kept in
 * `selectorCache` and reused on later calls. Every call still gets its own
 * predicate instance, because a predicate carries per-run matching state.
 *
 * @param selector - CSS selector text
 * @returns `(node, parent, i, debug?) => boolean` with fresh state
 */
export const createMatcher = (selector: string) => {
  const cachedFactory = selectorCache.get(selector)
  if (cachedFactory) return cachedFactory()
  const factory = compileMatcher(elParse(selector), selector)
  selectorCache.set(selector, factory)
  return factory()
}

/**
 * Parses `html` into a document tree, memoized by the exact source string
 * through `parsedHtmlCache`; repeated calls with the same text return the
 * cached tree.
 */
export const parseHtml = (html: string) => {
  let doc = parsedHtmlCache.get(html)
  if (!doc) {
    doc = ultraParse(html)
    parsedHtmlCache.set(html, doc)
  }
  return doc
}

/**
 * Finds the first node of `doc` (in walk order) that matches `selector`.
 *
 * Results are cached per document and selector for ten minutes. Both hits and
 * misses are stored, wrapped in a one-element array so that a cached "not
 * found" can be told apart from "not cached".
 *
 * @param doc - parsed document to search
 * @param selector - CSS selector; when empty, `doc` itself is returned
 * @returns the matching node, or `undefined` when nothing matches
 * @throws whatever the selector compiler or a matcher throws
 */
export const findNode = (doc: NodeType, selector: string) => {
  if (!selector) return doc
  let docCache = findNodeCache.get(doc)
  if (!docCache) {
    docCache = new NodeCache({ stdTTL: 10*60, useClones: false })
    findNodeCache.set(doc, docCache)
  }
  const cached = docCache.get(selector)
  if (cached !== undefined) return cached[0]
  //console.log('cache miss', {selector})
  const matcher = createMatcher(selector)
  // The walk offers no early exit, so a private sentinel is thrown to stop it;
  // anything else that gets thrown is a genuine failure and is propagated.
  const stopWalk = Symbol('findNode.stop')
  let match
  try {
    walkSync(doc, (node, parent, index) => {
      if (matcher(node, parent, index)) {
        match = node
        throw stopWalk
      }
    })
  } catch (e) {
    if (e !== stopWalk) throw e
  }
  docCache.set(selector, [ match ])
  return match
}