diff --git a/README.md b/README.md index e0eec10..dd21110 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ Clone this repository and run `go build` to build the executable. * `-output some_directory` - Output path (default "./downloads") * `-metadata` - Write video metadata to a .json file * `-batch-file` - File containing URLs/Usernames to download, one value per line. Lines starting with '#', are considered as comments and ignored. +* `-deadline` - Sets the timeout for scraper logic in seconds (used as a workaround for context deadline exceeded error) (default 1500) ## Acknowledgments This software uses the **chromedp** for web scraping, it can be found here: https://github.com/chromedp/chromedp \ diff --git a/client/executeClientAction.go b/client/executeClientAction.go new file mode 100644 index 0000000..98745b3 --- /dev/null +++ b/client/executeClientAction.go @@ -0,0 +1,58 @@ +package client + +import ( + "context" + "github.com/chromedp/chromedp" + "io/ioutil" + "log" + "os" + "time" + + models "../models" + utils "../utils" +) + +// executeClientAction - Navigates to url, runs scraper.js followed by jsAction, and returns the inner HTML of the `video_urls` selector +func executeClientAction(url string, jsAction string) string { + dir, err := ioutil.TempDir("", "chromedp-example") + if err != nil { + panic(err) + } + defer os.RemoveAll(dir) + + opts := append(chromedp.DefaultExecAllocatorOptions[:], + chromedp.DisableGPU, + chromedp.UserDataDir(dir), + chromedp.Flag("headless", !models.Config.Debug), + ) + + allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) 
+ defer cancel() + + ctx, cancel := chromedp.NewContext( + allocCtx, + chromedp.WithLogf(log.Printf), + ) + defer cancel() + + ctx, cancel = context.WithTimeout(ctx, time.Duration(models.Config.Deadline)*time.Second) + defer cancel() + + var jsOutput string + err = chromedp.Run(ctx, + // Navigate to user's page + chromedp.Navigate(url), + // Execute url grabber script + chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), + chromedp.EvaluateAsDevTools(jsAction, &jsOutput), + // Wait until custom js finishes + chromedp.WaitVisible(`video_urls`), + // Grab url links from our element + chromedp.InnerHTML(`video_urls`, &jsOutput), + ) + if err != nil { + log.Fatal(err) + } + + return jsOutput +} diff --git a/client/getMusicUploads.go b/client/getMusicUploads.go index 1ab1977..4648940 100644 --- a/client/getMusicUploads.go +++ b/client/getMusicUploads.go @@ -1,58 +1,11 @@ package client import ( - "context" - "github.com/chromedp/chromedp" - "io/ioutil" - "log" - "os" - "time" - models "../models" - utils "../utils" ) // GetMusicUploads - Get all uploads by given music func GetMusicUploads(url string) []models.Upload { - dir, err := ioutil.TempDir("", "chromedp-example") - if err != nil { - panic(err) - } - defer os.RemoveAll(dir) - - opts := append(chromedp.DefaultExecAllocatorOptions[:], - chromedp.DisableGPU, - chromedp.UserDataDir(dir), - chromedp.Flag("headless", !models.Config.Debug), - ) - - allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) 
- defer cancel() - - ctx, cancel := chromedp.NewContext( - allocCtx, - chromedp.WithLogf(log.Printf), - ) - defer cancel() - - ctx, cancel = context.WithTimeout(ctx, 1500*time.Second) - defer cancel() - - var jsOutput string - err = chromedp.Run(ctx, - // Navigate to user's page - chromedp.Navigate(url), - // Execute url grabber script - chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), - chromedp.EvaluateAsDevTools("bootstrapIteratingVideos()", &jsOutput), - // Wait until custom js finishes - chromedp.WaitVisible(`video_urls`), - // Grab url links from our element - chromedp.InnerHTML(`video_urls`, &jsOutput), - ) - if err != nil { - log.Fatal(err) - } - - return models.ParseUploads(jsOutput) + actionOutput := executeClientAction(url, "bootstrapIteratingVideos()") + return models.ParseUploads(actionOutput) } diff --git a/client/getUserUploads.go b/client/getUserUploads.go index 7e37f09..f68d7b1 100644 --- a/client/getUserUploads.go +++ b/client/getUserUploads.go @@ -1,58 +1,11 @@ package client import ( - "context" - "github.com/chromedp/chromedp" - "io/ioutil" - "log" - "os" - "time" - models "../models" - utils "../utils" ) // GetUserUploads - Get all uploads by user func GetUserUploads(username string) []models.Upload { - dir, err := ioutil.TempDir("", "chromedp-example") - if err != nil { - panic(err) - } - defer os.RemoveAll(dir) - - opts := append(chromedp.DefaultExecAllocatorOptions[:], - chromedp.DisableGPU, - chromedp.UserDataDir(dir), - chromedp.Flag("headless", !models.Config.Debug), - ) - - allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) 
- defer cancel() - - ctx, cancel := chromedp.NewContext( - allocCtx, - chromedp.WithLogf(log.Printf), - ) - defer cancel() - - ctx, cancel = context.WithTimeout(ctx, 1500*time.Second) - defer cancel() - - var jsOutput string - err = chromedp.Run(ctx, - // Navigate to user's page - chromedp.Navigate(`https://www.tiktok.com/@`+username), - // Execute url grabber script - chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), - chromedp.EvaluateAsDevTools("bootstrapIteratingVideos()", &jsOutput), - // Wait until custom js finishes - chromedp.WaitVisible(`video_urls`), - // Grab url links from our element - chromedp.InnerHTML(`video_urls`, &jsOutput), - ) - if err != nil { - log.Fatal(err) - } - - return models.ParseUploads(jsOutput) + actionOutput := executeClientAction(`https://www.tiktok.com/@`+username, "bootstrapIteratingVideos()") + return models.ParseUploads(actionOutput) } diff --git a/client/getVideoDetails.go b/client/getVideoDetails.go index 1bb5af1..20a0d5d 100644 --- a/client/getVideoDetails.go +++ b/client/getVideoDetails.go @@ -1,58 +1,11 @@ package client import ( - "context" - "github.com/chromedp/chromedp" - "io/ioutil" - "log" - "os" - "time" - models "../models" - utils "../utils" ) // GetVideoDetails - returns details of video func GetVideoDetails(videoURL string) models.Upload { - dir, err := ioutil.TempDir("", "chromedp-example") - if err != nil { - panic(err) - } - defer os.RemoveAll(dir) - - opts := append(chromedp.DefaultExecAllocatorOptions[:], - chromedp.DisableGPU, - chromedp.UserDataDir(dir), - chromedp.Flag("headless", !models.Config.Debug), - ) - - allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) 
- defer cancel() - - ctx, cancel := chromedp.NewContext( - allocCtx, - chromedp.WithLogf(log.Printf), - ) - defer cancel() - - ctx, cancel = context.WithTimeout(ctx, 1500*time.Second) - defer cancel() - - var jsOutput string - err = chromedp.Run(ctx, - // Navigate to user's page - chromedp.Navigate(videoURL), - // Execute url grabber script - chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), - chromedp.EvaluateAsDevTools("bootstrapGetCurrentVideo()", &jsOutput), - // Wait until custom js finishes - chromedp.WaitVisible(`video_urls`), - // Grab url links from our element - chromedp.InnerHTML(`video_urls`, &jsOutput), - ) - if err != nil { - log.Fatal(err) - } - - return models.ParseUpload(jsOutput) + actionOutput := executeClientAction(videoURL, "bootstrapGetCurrentVideo()") + return models.ParseUpload(actionOutput) } diff --git a/models/config.go b/models/config.go index 0b68cb1..40fc4dc 100644 --- a/models/config.go +++ b/models/config.go @@ -15,6 +15,7 @@ var Config struct { BatchFilePath string Debug bool MetaData bool + Deadline int } // GetConfig - Returns Config object @@ -23,6 +24,7 @@ func GetConfig() { batchFilePath := flag.String("batch-file", "", "File containing URLs/Usernames to download, one value per line. Lines starting with '#', are considered as comments and ignored.") debug := flag.Bool("debug", false, "Enables debug mode") metadata := flag.Bool("metadata", false, "Write video metadata to a .json file") + deadline := flag.Int("deadline", 1500, "Sets the timout for scraper logic in seconds (used as a workaround for 'context deadline exceeded' error)") flag.Parse() args := flag.Args() @@ -41,6 +43,7 @@ func GetConfig() { Config.BatchFilePath = *batchFilePath Config.Debug = *debug Config.MetaData = *metadata + Config.Deadline = *deadline } // GetUsername - Get's username from passed URL param