mirror of https://github.com/pikami/rss-dl.git
Added html parser

commit 01f7879f7e (parent d3cf54472a)
GitHub Actions workflow

@@ -9,10 +9,10 @@ jobs:
     runs-on: ${{ matrix.os }}
 
     steps:
-    - name: Set up Go 1.13
+    - name: Set up Go 1.16
      uses: actions/setup-go@v1
      with:
-        go-version: 1.13
+        go-version: 1.16
      id: go
 
    - name: Check out code into the Go module directory
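The CI Go version is bumped from 1.13 to 1.16, matching the `go 1.16` directive already declared in go.mod below.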
README.md

@@ -11,6 +11,7 @@ You can download feeds by running `./rss-dl [Options] FEED_URL`
 
 ## Available options
 * `-output some_directory` - Output path (default ".")
+* `-parsehtml` - Save content as html
 
 ## Acknowledgments
 This software uses the gofeed parser which can be found here: https://github.com/mmcdole/gofeed
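With the new flag, a typical invocation would look like `./rss-dl -output ./feeds -parsehtml https://example.com/feed.xml` (the URL and directory here are made up for illustration); item content is then saved as HTML in addition to the usual downloads.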
go.mod
@@ -2,4 +2,8 @@ module github.com/pikami/rss-dl
 
 go 1.16
 
-require github.com/mmcdole/gofeed v1.1.3
+require (
+	github.com/PuerkitoBio/goquery v1.6.1 // indirect
+	github.com/mmcdole/gofeed v1.1.3
+	golang.org/x/net v0.0.0-20200301022130-244492dfa37a // indirect
+)
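In go.mod, a `// indirect` comment marks a requirement that is not imported directly by packages in the main module; since the new htmlparse package below does import goquery directly, a later `go mod tidy` would be expected to drop that marker.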
go.sum
@@ -1,6 +1,8 @@
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
 github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
+github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk=
+github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
 github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
 github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
 github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
htmlparse (new file)

@@ -0,0 +1,52 @@
+package htmlparse
+
+import (
+	"fmt"
+	"path/filepath"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+	"golang.org/x/net/html"
+
+	fileio "github.com/pikami/rss-dl/fileio"
+	helpers "github.com/pikami/rss-dl/helpers"
+)
+
+func HtmlGrab(htmlStr string, itemOutputDir string) {
+	rootNode, err := html.Parse(strings.NewReader(htmlStr))
+	if err != nil {
+		return
+	}
+
+	// Init download dir
+	outputDir := itemOutputDir + "/html"
+	fileio.InitOutputDirectory(outputDir)
+
+	// Load the HTML document
+	doc := goquery.NewDocumentFromNode(rootNode)
+
+	// Download assets
+	doc.Find("img").Each(func(i int, s *goquery.Selection) {
+		// For each image found, get its source URL
+		val, exists := s.Attr("src")
+		if exists {
+			imageName := helpers.RemoveGetParams(filepath.Base(val))
+			itemImagePath := outputDir + "/" + imageName
+			helpers.LogInfo("Downloading image to " + itemImagePath)
+			fileio.DownloadFile(
+				itemImagePath,
+				val)
+
+			fmt.Printf("[htmlgrab] %d: %s\n", i, val)
+
+			s.SetAttr("src", imageName)
+		}
+	})
+
+	newHtml, err := doc.Html()
+	if err != nil {
+		return
+	}
+
+	fileio.WriteToFile(outputDir+"/index.html", newHtml)
+}
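For context, here is a minimal standalone sketch of the same img-rewriting pass that HtmlGrab performs, with the download step omitted and the query-string stripping inlined (helpers.RemoveGetParams and fileio are project-internal, so their behavior is assumed here; the sample HTML is made up):

package main

import (
	"fmt"
	"path/filepath"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
)

func main() {
	// Made-up item content for illustration.
	const page = `<p>hello</p><img src="https://cdn.example.com/img/cat.png?w=640">`

	rootNode, err := html.Parse(strings.NewReader(page))
	if err != nil {
		panic(err)
	}
	doc := goquery.NewDocumentFromNode(rootNode)

	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		val, exists := s.Attr("src")
		if !exists {
			return
		}
		// Assumed equivalent of helpers.RemoveGetParams(filepath.Base(val)):
		// keep only the file name and drop any query string.
		name := filepath.Base(val)
		if idx := strings.IndexByte(name, '?'); idx != -1 {
			name = name[:idx]
		}
		fmt.Printf("[htmlgrab] %d: %s -> %s\n", i, val, name)
		s.SetAttr("src", name) // point the page at the local copy
	})

	newHtml, err := doc.Html()
	if err != nil {
		panic(err)
	}
	fmt.Println(newHtml) // img src is now the bare "cat.png"
}

Note that goquery.NewDocumentFromNode wraps an already-parsed *html.Node, so the same tree is both queried and re-serialized through doc.Html().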
main.go
@@ -9,6 +9,7 @@ import (
 
 	fileio "github.com/pikami/rss-dl/fileio"
 	helpers "github.com/pikami/rss-dl/helpers"
+	htmlgrab "github.com/pikami/rss-dl/htmlgrab"
 	structs "github.com/pikami/rss-dl/structs"
 )
 
@@ -26,8 +27,10 @@ func main() {
 	helpers.LogInfo("Writing feed details as JSON to " + feedInfoPath)
 	fileio.WriteToFile(feedInfoPath, GrabFeedDetailsJSON(feed))
 
-	feedImagePath := outputDir + "/image" + helpers.RemoveGetParams(filepath.Ext(feed.Image.URL))
-	fileio.DownloadFile(feedImagePath, feed.Image.URL)
+	if feed.Image != nil {
+		feedImagePath := outputDir + "/image" + helpers.RemoveGetParams(filepath.Ext(feed.Image.URL))
+		fileio.DownloadFile(feedImagePath, feed.Image.URL)
+	}
 
 	for _, item := range feed.Items {
 		itemOutputFilename := helpers.ToCleanString(
@@ -63,6 +66,10 @@ func main() {
 				itemOutputDir+"/"+filename,
 				enclosure.URL)
 		}
+
+		if structs.Config.ParseHtml {
+			htmlgrab.HtmlGrab(item.Content, itemOutputDir)
+		}
 	}
 }
 
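Taken together: when `-parsehtml` is set, main.go hands each feed item's Content to htmlgrab.HtmlGrab, which (per the new file above) downloads the referenced images and writes the rewritten page to the item's output directory under html/index.html.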
structs

@@ -10,11 +10,13 @@ import (
 var Config struct {
 	FeedURL    string
 	OutputPath string
+	ParseHtml  bool
 }
 
 // GetConfig - Populates the Config object
 func GetConfig() {
 	outputPath := flag.String("output", ".", "Output path")
+	parseHtml := flag.Bool("parsehtml", false, "Save content as html")
 
 	flag.Parse()
 
@@ -26,4 +28,5 @@ func GetConfig() {
 
 	Config.FeedURL = flag.Args()[len(args)-1]
 	Config.OutputPath = *outputPath
+	Config.ParseHtml = *parseHtml
 }
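The flag wiring above follows the standard library pattern: define option flags, call flag.Parse(), then read positional arguments from flag.Args(). A self-contained sketch of that pattern (names mirror the diff; the usage message is an illustration, not the program's actual output):

package main

import (
	"flag"
	"fmt"
	"os"
)

func main() {
	outputPath := flag.String("output", ".", "Output path")
	parseHtml := flag.Bool("parsehtml", false, "Save content as html")
	flag.Parse()

	args := flag.Args()
	if len(args) < 1 {
		fmt.Fprintln(os.Stderr, "usage: rss-dl [Options] FEED_URL")
		os.Exit(1)
	}
	feedURL := args[len(args)-1] // the feed URL is the last positional argument

	fmt.Println("feed:", feedURL, "output:", *outputPath, "parse html:", *parseHtml)
}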