当前位置: 首页 > 工具软件 > ATOM-Gitee > 使用案例 >

html2article java_article.go · Gitee 极速下载/html2article-golang - Gitee.com

东郭阳德
2023-12-01

package html2article

import (

"net/url"

"path"

"strings"

"golang.org/x/net/html"

"golang.org/x/net/html/atom"

)

// Article is the extracted representation of a page's main content.
//
// The exported fields are serialized to JSON under the names given in their
// tags; contentNode is unexported and therefore never serialized.
type Article struct {
	// Basic

	// Html is the article markup, serialized as "content_html". ParseImage
	// rewrites image references inside it.
	Html string `json:"content_html"`

	// Content is serialized as "content".
	// NOTE(review): presumably the plain-text body; confirm against the extractor.
	Content string `json:"content"`

	// Title is serialized as "title".
	Title string `json:"title"`

	// Publishtime is serialized as "publish_time".
	// NOTE(review): the unit (seconds vs. milliseconds) is not visible here; confirm.
	Publishtime int64 `json:"publish_time"`

	// Others

	// Images lists the image references of the article. ParseImage replaces
	// relative entries with absolute URLs in place.
	Images []string `json:"images"`

	// ReadContent is the cleaned-up markup produced by ParseReadContent.
	ReadContent string `json:"read_content"`

	// contentNode is the parsed node the article was built from. It is the
	// input of ParseReadContent and Paragraphs and is exposed through
	// GetContentNode.
	contentNode *html.Node
}

// Readable fills in ReadContent and then makes the article's image references
// absolute, using urlStr as the address of the page the article came from.
func (a *Article) Readable(urlStr string) {
	// Order matters: ParseImage rewrites image references inside ReadContent,
	// so ReadContent has to be generated first.
	a.ParseReadContent()
	a.ParseImage(urlStr)
}

// ParseReadContent produces a reader-friendly version of the article and
// stores it in ReadContent.
//
// It works directly on the article's content node (the node is modified in
// place, not copied):
//   - cleanAttrs is applied for the presentation and event attributes class,
//     id, style, width, height, onclick, onmouseover and border;
//   - comment nodes and <script> / <object> elements are removed;
//   - the resulting tree is serialized with getHtml.
//
// The method does nothing when the article has no content node.
func (a *Article) ParseReadContent() {
	if a.contentNode == nil {
		// The cleaning helpers dereference the node they are given, so a
		// missing content node would otherwise panic.
		return
	}

	a.cleanAttrs(a.contentNode, "class", "id", "style", "width", "height", "onclick", "onmouseover", "border")
	a.clean(a.contentNode, func(n *html.Node) bool {
		return n.Type == html.CommentNode || n.DataAtom == atom.Script || n.DataAtom == atom.Object
	})

	// This method has no error return, so a serialization error is
	// intentionally dropped; ReadContent then holds whatever getHtml returned.
	a.ReadContent, _ = getHtml(a.contentNode)
	// a.ReadContent = CompressHtml(a.ReadContent)
}

// ParseImage turns the relative entries of a.Images into absolute URLs and
// applies the same rewrite to their occurrences in a.Html and a.ReadContent.
//
// urlStr is the address of the page the article was extracted from. The
// method does nothing when urlStr cannot be parsed or carries no scheme,
// because no absolute URL can be built from it.
//
// Entries are resolved as follows:
//   - empty entries and entries that already carry a scheme (http:, https:,
//     data:, ...) are left untouched;
//   - "//host/path" receives the page's scheme;
//   - "/path" is appended to the page's scheme and host;
//   - anything else is joined onto the directory of the page's path.
//
// Known limitation: the markup is rewritten textually, so a relative
// reference that also occurs as a substring of unrelated text (for example
// inside an already absolute URL) is rewritten there as well.
func (a *Article) ParseImage(urlStr string) {
	base, err := url.Parse(urlStr)
	if err != nil || !base.IsAbs() {
		return
	}

	origin := base.Scheme + "://" + base.Host
	// Directory of the page. Only the path is used: the query string may
	// contain "/" and must not influence the result. The leading "/" keeps
	// the directory rooted even when the page URL has an empty path.
	dir := path.Join("/", path.Dir(base.EscapedPath()))

	seen := make(map[string]bool, len(a.Images))
	var pairs []string // old1, new1, old2, new2, ... as expected by strings.NewReplacer
	for i, img := range a.Images {
		if img == "" {
			// An empty search string would match between every character of
			// the markup during replacement.
			continue
		}
		if ref, perr := url.Parse(img); perr == nil && ref.IsAbs() {
			continue
		}

		var abs string
		switch {
		case strings.HasPrefix(img, "//"):
			abs = base.Scheme + ":" + img
		case strings.HasPrefix(img, "/"):
			abs = origin + img
		default:
			abs = origin + path.Join(dir, img)
		}
		a.Images[i] = abs

		if !seen[img] {
			seen[img] = true
			pairs = append(pairs, img, abs)
		}
	}
	if len(pairs) == 0 {
		return
	}

	// A single-pass replacer never re-scans text it has just inserted and
	// gives the same result on every run, unlike chained replacements driven
	// by map iteration order.
	rewrite := strings.NewReplacer(pairs...)
	a.Html = rewrite.Replace(a.Html)
	a.ReadContent = rewrite.Replace(a.ReadContent)
}

// Paragraphs returns the text of the <p> elements visited by walk under the
// article's content node, each passed through Compress. Paragraphs whose
// compressed text is empty are left out. The result is never nil.
func (a *Article) Paragraphs() []string {
	result := make([]string, 0)

	visit := func(node *html.Node) bool {
		if !isTag(atom.P)(node) {
			// Not a paragraph: keep walking.
			return true
		}
		if body := Compress(text(node)); body != "" {
			result = append(result, body)
		}
		// NOTE(review): false presumably tells walk not to descend into the
		// paragraph's children; confirm against walk.
		return false
	}

	walk(a.contentNode, visit)
	return result
}

// clean removes from the tree below sel every node for which toClean reports
// true. A matching node is removed together with its whole subtree; a
// non-matching node is kept and its own children are cleaned recursively.
// sel itself is never tested or removed. A nil sel is a no-op.
func (a *Article) clean(sel *html.Node, toClean selector) {
	if sel == nil {
		return
	}
	for c := sel.FirstChild; c != nil; {
		// RemoveChild detaches c from its siblings, so the successor has to be
		// read before c is possibly removed. Advancing from the saved successor
		// also guarantees that every child is examined exactly once, including
		// the one that becomes the first child after a removal.
		next := c.NextSibling
		if toClean(c) {
			sel.RemoveChild(c)
		} else {
			a.clean(c, toClean)
		}
		c = next
	}
}

// cleanAttrs calls removeAttr on sel once for each name in attrs and then does
// the same, depth first, for every descendant of sel.
func (a *Article) cleanAttrs(sel *html.Node, attrs ...string) {
	for i := range attrs {
		removeAttr(sel, attrs[i])
	}

	child := sel.FirstChild
	for child != nil {
		a.cleanAttrs(child, attrs...)
		child = child.NextSibling
	}
}

// GetContentNode returns the article's content node. The node itself is
// returned, not a copy.
func (a *Article) GetContentNode() *html.Node {
	return a.contentNode
}

一键复制

编辑

Web IDE

原始数据

按行查看

历史

 类似资料: