Added html parser

This commit is contained in:
Pijus Kamandulis 2021-06-08 23:43:32 +03:00
parent d3cf54472a
commit 01f7879f7e
7 changed files with 74 additions and 5 deletions

View File

@ -9,10 +9,10 @@ jobs:
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
steps: steps:
- name: Set up Go 1.13 - name: Set up Go 1.16
uses: actions/setup-go@v1 uses: actions/setup-go@v1
with: with:
go-version: 1.13 go-version: 1.16
id: go id: go
- name: Check out code into the Go module directory - name: Check out code into the Go module directory

View File

@ -11,6 +11,7 @@ You can download feeds by running `./rss-dl [Options] FEED_URL`
## Available options ## Available options
* `-output some_directory` - Output path (default ".") * `-output some_directory` - Output path (default ".")
* `-parsehtml` - Save content as html
## Acknowledgments ## Acknowledgments
This software uses the gofeed parser which can be found here: https://github.com/mmcdole/gofeed This software uses the gofeed parser which can be found here: https://github.com/mmcdole/gofeed

6
go.mod
View File

@ -2,4 +2,8 @@ module github.com/pikami/rss-dl
go 1.16 go 1.16
require github.com/mmcdole/gofeed v1.1.3 require (
github.com/PuerkitoBio/goquery v1.6.1 // indirect
github.com/mmcdole/gofeed v1.1.3
golang.org/x/net v0.0.0-20200301022130-244492dfa37a // indirect
)

2
go.sum
View File

@ -1,6 +1,8 @@
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk=
github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=

52
htmlgrab/htmlgrab.go Normal file
View File

@ -0,0 +1,52 @@
package htmlparse
import (
"fmt"
"path/filepath"
"strings"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
fileio "github.com/pikami/rss-dl/fileio"
helpers "github.com/pikami/rss-dl/helpers"
)
// HtmlGrab downloads the image assets referenced by an HTML fragment,
// rewrites each <img> src attribute to point at the local copy, and
// writes the resulting document to <itemOutputDir>/html/index.html.
// Errors are logged and the item is skipped; they never abort the caller.
func HtmlGrab(htmlStr string, itemOutputDir string) {
	rootNode, err := html.Parse(strings.NewReader(htmlStr))
	if err != nil {
		// Unparseable content: log instead of silently dropping the error.
		helpers.LogInfo("Skipping HTML grab, parse failed: " + err.Error())
		return
	}
	// Assets and the rewritten document live in an "html" subdirectory.
	outputDir := filepath.Join(itemOutputDir, "html")
	fileio.InitOutputDirectory(outputDir)
	// Wrap the parsed tree for CSS-selector queries.
	doc := goquery.NewDocumentFromNode(rootNode)
	// Download every referenced image and retarget its src to the local file name.
	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		val, exists := s.Attr("src")
		if !exists {
			return
		}
		// Strip query-string noise so the saved file has a clean name.
		imageName := helpers.RemoveGetParams(filepath.Base(val))
		itemImagePath := filepath.Join(outputDir, imageName)
		helpers.LogInfo("Downloading image to " + itemImagePath)
		fileio.DownloadFile(
			itemImagePath,
			val)
		fmt.Printf("[htmlgrab] %d: %s\n", i, val)
		s.SetAttr("src", imageName)
	})
	newHtml, err := doc.Html()
	if err != nil {
		// Serialization failure: log instead of silently dropping the error.
		helpers.LogInfo("Failed to serialize rewritten HTML: " + err.Error())
		return
	}
	fileio.WriteToFile(filepath.Join(outputDir, "index.html"), newHtml)
}

View File

@ -9,6 +9,7 @@ import (
fileio "github.com/pikami/rss-dl/fileio" fileio "github.com/pikami/rss-dl/fileio"
helpers "github.com/pikami/rss-dl/helpers" helpers "github.com/pikami/rss-dl/helpers"
htmlgrab "github.com/pikami/rss-dl/htmlgrab"
structs "github.com/pikami/rss-dl/structs" structs "github.com/pikami/rss-dl/structs"
) )
@ -26,8 +27,10 @@ func main() {
helpers.LogInfo("Writing feed details as JSON to " + feedInfoPath) helpers.LogInfo("Writing feed details as JSON to " + feedInfoPath)
fileio.WriteToFile(feedInfoPath, GrabFeedDetailsJSON(feed)) fileio.WriteToFile(feedInfoPath, GrabFeedDetailsJSON(feed))
if feed.Image != nil {
feedImagePath := outputDir + "/image" + helpers.RemoveGetParams(filepath.Ext(feed.Image.URL)) feedImagePath := outputDir + "/image" + helpers.RemoveGetParams(filepath.Ext(feed.Image.URL))
fileio.DownloadFile(feedImagePath, feed.Image.URL) fileio.DownloadFile(feedImagePath, feed.Image.URL)
}
for _, item := range feed.Items { for _, item := range feed.Items {
itemOutputFilename := helpers.ToCleanString( itemOutputFilename := helpers.ToCleanString(
@ -63,6 +66,10 @@ func main() {
itemOutputDir+"/"+filename, itemOutputDir+"/"+filename,
enclosure.URL) enclosure.URL)
} }
if structs.Config.ParseHtml {
htmlgrab.HtmlGrab(item.Content, itemOutputDir)
}
} }
} }

View File

@ -10,11 +10,13 @@ import (
var Config struct { var Config struct {
FeedURL string FeedURL string
OutputPath string OutputPath string
ParseHtml bool
} }
// GetConfig - Returns Config object // GetConfig - Returns Config object
func GetConfig() { func GetConfig() {
outputPath := flag.String("output", ".", "Output path") outputPath := flag.String("output", ".", "Output path")
parseHtml := flag.Bool("parsehtml", false, "Save content as html")
flag.Parse() flag.Parse()
@ -26,4 +28,5 @@ func GetConfig() {
Config.FeedURL = flag.Args()[len(args)-1] Config.FeedURL = flag.Args()[len(args)-1]
Config.OutputPath = *outputPath Config.OutputPath = *outputPath
Config.ParseHtml = *parseHtml
} }