前言

在一些文章类程序中，如果直接对文章全文进行检索，数据量大、速度较慢。我们可以在保存文章时提取摘要，方便后续检索。

根据字数获取

这种方式可以作为文章概要。

不过滤

/**
 * Build a plain-text summary from the first element matching a CSS class.
 *
 * @param {string} className - Class name of the container, without the leading dot.
 * @param {number} [maxLength=500] - Maximum number of characters to keep; any
 *     falsy value (undefined, 0, ...) falls back to 500.
 * @returns {string} The leading text of the element, followed by "..." when it
 *     was truncated, or "" when no element matches.
 */
function getSummaryByContent(className, maxLength) {
    const limit = maxLength || 500;
    const container = document.querySelector(`.${className}`);
    // Return an empty summary instead of throwing when the container is absent,
    // matching the behaviour of the other summary helpers.
    if (!container) {
        return "";
    }
    // Plain text only; guard against a missing innerText value.
    const text = container.innerText || "";
    return text.substring(0, limit) + (text.length > limit ? "..." : "");
}

// Summarise the ".post-body" element in at most 200 characters and log it.
const description = getSummaryByContent("post-body", 200);
console.info(description);

排除某些标签

排除代码标签

/**
 * Build a plain-text summary of a container while skipping `pre` and `figure`
 * elements (typically code blocks), at any nesting depth.
 *
 * Only element children of the container contribute text; bare text nodes
 * placed directly inside the container are ignored.
 *
 * @param {string} className - Class name of the container, without the leading dot.
 * @param {number} [maxLength=500] - Maximum number of characters to keep; any
 *     falsy value falls back to 500.
 * @returns {string} The leading text, followed by "..." when truncated, or ""
 *     when no element matches.
 */
function getSummaryExcludeTag(className, maxLength) {
    const limit = maxLength || 500;
    const container = document.querySelector(`.${className}`);
    if (!container) {
        return "";
    }
    // Tags whose content must not appear in the summary.
    const removedTagNames = ['pre', 'figure'];
    const nestedSelector = removedTagNames.join(',');
    // Detached scratch element that receives clones of the kept children,
    // so the live page is never modified.
    const scratch = document.createElement('div');
    Array.from(container.children).forEach(child => {
        if (removedTagNames.includes(child.tagName.toLowerCase())) {
            return;
        }
        const copy = child.cloneNode(true);
        // Excluded tags can also sit deeper inside a kept child; strip those too.
        copy.querySelectorAll(nestedSelector).forEach(nested => nested.remove());
        scratch.appendChild(copy);
    });
    // The scratch element is not rendered, so innerText is expected to match
    // textContent here; fall back explicitly where innerText is unavailable.
    const text = scratch.innerText || scratch.textContent || "";
    return text.substring(0, limit) + (text.length > limit ? "..." : "");
}

// Summarise ".post-body" without code/figure blocks, capped at 400 characters, and log it.
const description = getSummaryExcludeTag("post-body", 400);
console.info(description);

排除某些样式

/**
 * Build a plain-text summary of a container while skipping every element that
 * carries the `highlight` class, at any nesting depth.
 *
 * Only element children of the container contribute text; bare text nodes
 * placed directly inside the container are ignored.
 *
 * @param {string} className - Class name of the container, without the leading dot.
 * @param {number} [maxLength=500] - Maximum number of characters to keep; any
 *     falsy value falls back to 500.
 * @returns {string} The leading text, followed by "..." when truncated, or ""
 *     when no element matches.
 */
function getSummaryExcludeClass(className, maxLength) {
    const limit = maxLength || 500;
    const container = document.querySelector(`.${className}`);
    if (!container) {
        return "";
    }
    // Class names whose elements must not appear in the summary.
    const removedClassNames = ['highlight'];
    const nestedSelector = removedClassNames.map(name => `.${name}`).join(',');
    // Detached scratch element that receives clones of the kept children,
    // so the live page is never modified.
    const scratch = document.createElement('div');
    Array.from(container.children).forEach(child => {
        if (removedClassNames.some(name => child.classList.contains(name))) {
            return;
        }
        const copy = child.cloneNode(true);
        // Excluded elements can also sit deeper inside a kept child; strip those too.
        copy.querySelectorAll(nestedSelector).forEach(nested => nested.remove());
        scratch.appendChild(copy);
    });
    // The scratch element is not rendered, so innerText is expected to match
    // textContent here; fall back explicitly where innerText is unavailable.
    const text = scratch.innerText || scratch.textContent || "";
    return text.substring(0, limit) + (text.length > limit ? "..." : "");
}

// Summarise ".post-body" without "highlight" blocks, capped at 400 characters, and log it.
const description = getSummaryExcludeClass("post-body", 400);
console.info(description);

根据标题获取

这种方式可以作为检索关键字使用。

/**
 * Build a keyword-style summary by joining the text of every heading
 * (h1–h6) found inside the first element matching a CSS class.
 *
 * @param {string} className - Class name of the container, without the leading dot.
 * @param {number} [maxLength=500] - Maximum number of characters to keep; any
 *     falsy value falls back to 500.
 * @returns {string} Heading texts joined by single spaces, followed by "..."
 *     when truncated, or "" when no element matches.
 */
function getSummaryByTitle(className, maxLength) {
    const limit = maxLength || 500;
    const container = document.querySelector(`.${className}`);
    // Return an empty summary instead of throwing when the container is absent.
    if (!container) {
        return "";
    }
    const headings = container.querySelectorAll('h1, h2, h3, h4, h5, h6');
    // Prefer innerText and fall back to textContent when it is empty or missing.
    const text = Array.from(headings)
        .map(heading => heading.innerText || heading.textContent)
        .join(" ");
    return text.substring(0, limit) + (text.length > limit ? "..." : "");
}

// Join the headings of ".post-body" into a summary of at most 200 characters and log it.
const description = getSummaryByTitle("post-body", 200);
console.info(description);

打印

// Log the plain text of every heading (h1–h6) inside ".post-body", numbered from 1.
const showDom = document.querySelector(".post-body");
const headings = showDom.querySelectorAll('h1, h2, h3, h4, h5, h6');
for (const [index, heading] of headings.entries()) {
    // Prefer innerText and fall back to textContent when it is empty or missing.
    const text = heading.innerText || heading.textContent;
    console.log(`标题 ${index + 1}: ${text}`);
}

获取高频词

以下实现基于正则 `\w+` 分词，只对英文（ASCII 字母、数字和下划线）有效。

/**
 * Return the words that occur at least `threshold` times in a text.
 *
 * Words are runs of `\w` characters (ASCII letters, digits and underscore),
 * compared case-insensitively, so text in other scripts yields no words.
 *
 * @param {string} text - Text to analyse; empty or missing text yields [].
 * @param {number} threshold - Minimum number of occurrences (inclusive).
 * @returns {string[]} Lower-cased words whose count is >= threshold.
 */
function getHighFrequencyWords(text, threshold) {
    // `match` returns null when nothing matches (empty or non-ASCII text),
    // so fall back to an empty list instead of throwing.
    const words = (text || "").toLowerCase().match(/\w+/g) || [];

    // Prototype-less dictionary: words such as "constructor" or "__proto__"
    // are counted like any other word instead of hitting Object.prototype.
    const wordCountMap = Object.create(null);
    for (const word of words) {
        wordCountMap[word] = (wordCountMap[word] || 0) + 1;
    }

    return Object.keys(wordCountMap).filter(word => wordCountMap[word] >= threshold);
}

// Read the plain text of the ".post-body" element.
const showDom = document.querySelector(".post-body");
const textContent = showDom.innerText;

// Keep the words that occur more than 2 times (threshold 3, inclusive) and log them.
const highFrequencyWords = getHighFrequencyWords(textContent, 3);
console.log("高频词：", highFrequencyWords);

中文高频词需要引用中文分词的库。

我是码客，我是全栈工程师，我为自己代言。

JS获取富文本(HTML)的摘要

前言

根据字数获取

不过滤

排除某些标签

排除某些样式

根据标题获取

获取高频词