From f724f0f2a2f5a952f283e6023825742ff9086e2f Mon Sep 17 00:00:00 2001 From: alexpin Date: Tue, 25 Feb 2020 00:56:19 +0200 Subject: [PATCH] Download videos by hashtag; get json data without video downloading; limit option --- README.md | 2 + client/executeClientAction.go | 151 +++++++++++++------------- client/getHashtagUploads.go | 19 ++++ client/getMusicUploads.go | 14 ++- client/getUserUploads.go | 10 +- models/config/config.go | 77 +++++++------ scraper.js | 30 +++-- utils/getHashtag.go | 16 +++ workflows/downloadHashtag.go | 36 ++++++ workflows/downloadMusic.go | 39 ++++--- workflows/downloadUser.go | 5 + workflows/downloadVideo.go | 50 ++++----- workflows/startWorkflowByParameter.go | 24 +++- 13 files changed, 308 insertions(+), 165 deletions(-) create mode 100644 client/getHashtagUploads.go create mode 100644 utils/getHashtag.go create mode 100644 workflows/downloadHashtag.go diff --git a/README.md b/README.md index 3b66933..4c0b208 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ Clone this repository and run `go build` to build the executable. * `-batch-file` - File containing URLs/Usernames to download, one value per line. Lines starting with '#', are considered as comments and ignored. 
* `-deadline` - Sets the timout for scraper logic in seconds (used as a workaround for context deadline exceeded error) (default 1500) * `-quiet` - Supress output +* `-json` - Returns all data scraped from TikTok as JSON (without downloading videos) +* `-limit` - Sets the maximum number of videos that will be downloaded (default: unlimited) ## Acknowledgments This software uses the **chromedp** for web scraping, it can be found here: https://github.com/chromedp/chromedp \ diff --git a/client/executeClientAction.go b/client/executeClientAction.go index 74fee36..6188103 100644 --- a/client/executeClientAction.go +++ b/client/executeClientAction.go @@ -1,101 +1,102 @@ package client import ( - "context" - "github.com/chromedp/chromedp" - "io/ioutil" - "log" - "os" - "time" + "context" + "github.com/chromedp/chromedp" + "io/ioutil" + "log" + "os" + "time" - config "../models/config" - utils "../utils" + config "../models/config" + utils "../utils" ) // GetMusicUploads - Get all uploads by given music func executeClientAction(url string, jsAction string) string { - dir, err := ioutil.TempDir("", "chromedp-example") - utils.CheckErr(err) - defer os.RemoveAll(dir) + dir, err := ioutil.TempDir("", "chromedp-example") + utils.CheckErr(err) + defer os.RemoveAll(dir) - opts := append(chromedp.DefaultExecAllocatorOptions[:], - chromedp.DisableGPU, - chromedp.UserDataDir(dir), - chromedp.Flag("headless", !config.Config.Debug), - ) + opts := append(chromedp.DefaultExecAllocatorOptions[:], + chromedp.DisableGPU, + chromedp.UserDataDir(dir), + chromedp.Flag("headless", !config.Config.Debug), + ) - allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) - defer cancel() + allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) 
+ defer cancel() - ctx, cancel := chromedp.NewContext( - allocCtx, - chromedp.WithLogf(log.Printf), - ) - defer cancel() + ctx, cancel := chromedp.NewContext( + allocCtx, + chromedp.WithLogf(log.Printf), + ) + defer cancel() - ctx, cancel = context.WithTimeout(ctx, time.Duration(config.Config.Deadline)*time.Second) - defer cancel() + ctx, cancel = context.WithTimeout(ctx, time.Duration(config.Config.Deadline)*time.Second) + defer cancel() - var jsOutput string - jsOutput = runScrapeWithInfo(ctx, jsAction, url) + var jsOutput string + jsOutput = runScrapeWithInfo(ctx, jsAction, url) - return jsOutput + return jsOutput } func runScrapeQuiet(ctx context.Context, jsAction string, url string) string { - var jsOutput string - err := chromedp.Run(ctx, - // Navigate to user's page - chromedp.Navigate(url), - // Execute url grabber script - chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), - chromedp.EvaluateAsDevTools(jsAction, &jsOutput), - // Wait until custom js finishes - chromedp.WaitVisible(`video_urls`), - // Grab url links from our element - chromedp.InnerHTML(`video_urls`, &jsOutput), - ) - utils.CheckErr(err) - return jsOutput + var jsOutput string + err := chromedp.Run(ctx, + // Navigate to user's page + chromedp.Navigate(url), + // Execute url grabber script + chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), + chromedp.EvaluateAsDevTools(jsAction, &jsOutput), + // Wait until custom js finishes + chromedp.WaitVisible(`video_urls`), + // Grab url links from our element + chromedp.InnerHTML(`video_urls`, &jsOutput), + ) + utils.CheckErr(err) + return jsOutput } func runScrapeWithInfo(ctx context.Context, jsAction string, url string) string { - var jsOutput string - err := chromedp.Run(ctx, - // Navigate to user's page - chromedp.Navigate(url), - // Execute url grabber script - chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), - chromedp.EvaluateAsDevTools(jsAction, &jsOutput), - ) 
- utils.CheckErr(err) + var jsOutput string + err := chromedp.Run(ctx, + // Navigate to user's page + chromedp.Navigate(url), + // Execute url grabber script + chromedp.WaitReady("video"), + chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), + chromedp.EvaluateAsDevTools(jsAction, &jsOutput), + ) + utils.CheckErr(err) - for { - err = chromedp.Run(ctx, chromedp.EvaluateAsDevTools("currentState.preloadCount.toString()", &jsOutput)) - utils.CheckErr(err) - if jsOutput != "0" { - utils.Logf("\rPreloading... Currently loaded %s items.", jsOutput) - } else { - utils.Logf("\rPreloading...") - } + for { + err = chromedp.Run(ctx, chromedp.EvaluateAsDevTools("currentState.preloadCount.toString()", &jsOutput)) + utils.CheckErr(err) + if jsOutput != "0" { + utils.Logf("\rPreloading... %s items have been founded.", jsOutput) + } else { + utils.Logf("\rPreloading...") + } - err = chromedp.Run(ctx, chromedp.EvaluateAsDevTools("currentState.finished.toString()", &jsOutput)) - utils.CheckErr(err) - if jsOutput == "true" { - break - } + err = chromedp.Run(ctx, chromedp.EvaluateAsDevTools("currentState.finished.toString()", &jsOutput)) + utils.CheckErr(err) + if jsOutput == "true" { + break + } - time.Sleep(50 * time.Millisecond) - } + time.Sleep(50 * time.Millisecond) + } - utils.Log("\nRetrieving items...") - err = chromedp.Run(ctx, - // Wait until custom js finishes - chromedp.WaitVisible(`video_urls`), - // Grab url links from our element - chromedp.InnerHTML(`video_urls`, &jsOutput), - ) - utils.CheckErr(err) + utils.Log("\nRetrieving items...") + err = chromedp.Run(ctx, + // Wait until custom js finishes + chromedp.WaitVisible(`video_urls`), + // Grab url links from our element + chromedp.InnerHTML(`video_urls`, &jsOutput), + ) + utils.CheckErr(err) - return jsOutput + return jsOutput } diff --git a/client/getHashtagUploads.go b/client/getHashtagUploads.go new file mode 100644 index 0000000..b3bd10c --- /dev/null +++ b/client/getHashtagUploads.go @@ 
-0,0 +1,19 @@ +package client + +import ( + models "../models" + config "../models/config" + "fmt" +) + +// GetHashtagUploads - Get all uploads marked with given hashtag +func GetHashtagUploads(hashtagURL string) []models.Upload { + jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit) + actionOutput := executeClientAction(hashtagURL, jsMethod) + return models.ParseUploads(actionOutput) +} + +func GetHashtagUploadsJson(hashtagURL string) string { + jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit) + return executeClientAction(hashtagURL, jsMethod) +} \ No newline at end of file diff --git a/client/getMusicUploads.go b/client/getMusicUploads.go index 4648940..5880fa0 100644 --- a/client/getMusicUploads.go +++ b/client/getMusicUploads.go @@ -1,11 +1,19 @@ package client import ( - models "../models" + models "../models" + config "../models/config" + "fmt" ) // GetMusicUploads - Get all uploads by given music func GetMusicUploads(url string) []models.Upload { - actionOutput := executeClientAction(url, "bootstrapIteratingVideos()") - return models.ParseUploads(actionOutput) + jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit) + actionOutput := executeClientAction(url, jsMethod) + return models.ParseUploads(actionOutput) } + +func GetMusicUploadsJson(url string) string { + jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit) + return executeClientAction(url, jsMethod) +} \ No newline at end of file diff --git a/client/getUserUploads.go b/client/getUserUploads.go index f68d7b1..7f38567 100644 --- a/client/getUserUploads.go +++ b/client/getUserUploads.go @@ -2,10 +2,18 @@ package client import ( models "../models" + config "../models/config" + "fmt" ) // GetUserUploads - Get all uploads by user func GetUserUploads(username string) []models.Upload { - actionOutput := executeClientAction(`https://www.tiktok.com/@`+username, "bootstrapIteratingVideos()") + jsMethod := 
fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit) + actionOutput := executeClientAction(`https://www.tiktok.com/@`+username, jsMethod) return models.ParseUploads(actionOutput) } + +func GetUserUploadsJson(username string) string { + jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit) + return executeClientAction(`https://www.tiktok.com/@`+username, jsMethod) +} \ No newline at end of file diff --git a/models/config/config.go b/models/config/config.go index 773c9ca..7495896 100644 --- a/models/config/config.go +++ b/models/config/config.go @@ -1,48 +1,57 @@ package config import ( - "flag" - "fmt" - "os" + "flag" + "fmt" + "os" ) // Config - Runtime configuration var Config struct { - URL string - OutputPath string - BatchFilePath string - Debug bool - MetaData bool - Quiet bool - Deadline int + URL string + OutputPath string + BatchFilePath string + Debug bool + MetaData bool + Quiet bool + Deadline int + Limit int + JSONOnly bool } // GetConfig - Returns Config object func GetConfig() { - outputPath := flag.String("output", "./downloads", "Output path") - batchFilePath := flag.String("batch-file", "", "File containing URLs/Usernames to download, one value per line. Lines starting with '#', are considered as comments and ignored.") - debug := flag.Bool("debug", false, "Enables debug mode") - metadata := flag.Bool("metadata", false, "Write video metadata to a .json file") - quiet := flag.Bool("quiet", false, "Supress output") - deadline := flag.Int("deadline", 1500, "Sets the timout for scraper logic in seconds (used as a workaround for 'context deadline exceeded' error)") - flag.Parse() + outputPath := flag.String("output", "./downloads", "Output path") + batchFilePath := flag.String("batch-file", "", "File containing URLs/Usernames to download, one value per line. 
Lines starting with '#', are considered as comments and ignored.") + debug := flag.Bool("debug", false, "Enables debug mode") + metadata := flag.Bool("metadata", false, "Write video metadata to a .json file") + quiet := flag.Bool("quiet", false, "Supress output") + deadline := flag.Int("deadline", 1500, "Sets the timout for scraper logic in seconds (used as a workaround for 'context deadline exceeded' error)") + limit := flag.Int("limit", 0, "Sets the videos count limit (useful when there too many videos from the user or by hashtag)") + jsonOnly := flag.Bool("json", false, "Just get JSON data from scraper (without video downloading)") + flag.Parse() - args := flag.Args() - if len(args) < 1 && *batchFilePath == "" { - fmt.Println("Usage: tiktok-dl [OPTIONS] TIKTOK_USERNAME|TIKTOK_URL") - fmt.Println(" or: tiktok-dl [OPTIONS] -batch-file path/to/users.txt") - os.Exit(2) - } + args := flag.Args() + if len(args) < 1 && *batchFilePath == "" { + fmt.Println("Usage: tiktok-dl [OPTIONS] TIKTOK_USERNAME|TIKTOK_URL") + fmt.Println(" or: tiktok-dl [OPTIONS] -batch-file path/to/users.txt") + os.Exit(2) + } - if len(args) > 0 { - Config.URL = flag.Args()[len(args)-1] - } else { - Config.URL = "" - } - Config.OutputPath = *outputPath - Config.BatchFilePath = *batchFilePath - Config.Debug = *debug - Config.MetaData = *metadata - Config.Quiet = *quiet - Config.Deadline = *deadline + if len(args) > 0 { + Config.URL = flag.Args()[len(args)-1] + } else { + Config.URL = "" + } + Config.OutputPath = *outputPath + Config.BatchFilePath = *batchFilePath + Config.Debug = *debug + Config.MetaData = *metadata + Config.Quiet = *quiet + if *jsonOnly { + Config.Quiet = true + } + Config.Deadline = *deadline + Config.Limit = *limit + Config.JSONOnly = *jsonOnly; } diff --git a/scraper.js b/scraper.js index e45a9b6..740cb06 100644 --- a/scraper.js +++ b/scraper.js @@ -1,7 +1,7 @@ optStrings = { selectors: { feedLoading: 'div.tiktok-loading.feed-loading', - modalArrowLeft: 'div.video-card-modal > 
div > img.arrow-right', + modalArrowRight: 'div.video-card-modal > div > img.arrow-right', modalClose: '.video-card-modal > div > div.close', modalPlayer: 'div > div > main > div.video-card-modal > div > div.video-card-big > div.video-card-container > div > div > video', modalShareInput: '.copy-link-container > input', @@ -30,6 +30,7 @@ optStrings = { currentState = { preloadCount: 0, finished: false, + limit: 100 }; createVidUrlElement = function(outputObj) { @@ -37,7 +38,7 @@ createVidUrlElement = function(outputObj) { urlSetElement.innerText = JSON.stringify(outputObj); document.getElementsByTagName(optStrings.tags.resultParentTag)[0].appendChild(urlSetElement); currentState.finished = true; -} +}; buldVidUrlArray = function(finishCallback) { var feedItem = document.getElementsByClassName(optStrings.classes.feedVideoItem)[0]; @@ -46,8 +47,14 @@ buldVidUrlArray = function(finishCallback) { var videoArray = []; var intervalID = window.setInterval(x => { videoArray.push(getCurrentModalVideo()); - - var arrowRight = document.querySelectorAll(optStrings.selectors.modalArrowLeft)[0]; + if(currentState.limit > 0) { + if (videoArray.length >= currentState.limit) { + window.clearInterval(intervalID); + document.querySelector(optStrings.selectors.modalClose).click(); + finishCallback(videoArray); + } + } + var arrowRight = document.querySelectorAll(optStrings.selectors.modalArrowRight)[0]; if (arrowRight.classList.contains(optStrings.classes.modalCloseDisabled)) { window.clearInterval(intervalID); document.querySelector(optStrings.selectors.modalClose).click(); @@ -78,7 +85,7 @@ getCurrentModalVideo = function() { link: soundHref, }, }; -} +}; getCurrentVideo = function() { var player = document.querySelector(optStrings.selectors.videoPlayer); @@ -100,13 +107,19 @@ getCurrentVideo = function() { link: soundHref, }, }; -} +}; scrollWhileNew = function(finishCallback) { var state = { count: 0 }; var intervalID = window.setInterval(x => { var oldCount = state.count; 
state.count = document.getElementsByClassName(optStrings.classes.feedVideoItem).length; + if(currentState.limit > 0) { + if (currentState.preloadCount >= currentState.limit || state.count >= currentState.limit) { + finishCallback(createVidUrlElement); + window.clearInterval(intervalID); + } + } if (oldCount !== state.count) { currentState.preloadCount = state.count; window.scrollTo(0, document.body.scrollHeight); @@ -121,7 +134,8 @@ scrollWhileNew = function(finishCallback) { }, 1000); }; -bootstrapIteratingVideos = function() { +bootstrapIteratingVideos = function(limit) { + currentState.limit = limit; scrollWhileNew(buldVidUrlArray); return 'bootstrapIteratingVideos'; }; @@ -130,7 +144,7 @@ bootstrapGetCurrentVideo = function() { var video = getCurrentVideo(); createVidUrlElement(video); return 'bootstrapGetCurrentVideo'; -} +}; init = () => { const newProto = navigator.__proto__; diff --git a/utils/getHashtag.go b/utils/getHashtag.go new file mode 100644 index 0000000..d9c8ba3 --- /dev/null +++ b/utils/getHashtag.go @@ -0,0 +1,16 @@ +package utils + +import ( + res "../resources" + "fmt" + "strings" +) + +// GetHashtagFromURL - Gets the tag name from the passed URL +func GetHashtagFromURL(str string) string { + if match := strings.Contains(str, "/tag/"); match { + return strings.Split(str, "/tag/")[1] + } + + panic(fmt.Sprintf(res.ErrorCouldNotRecogniseURL, str)) +} diff --git a/workflows/downloadHashtag.go b/workflows/downloadHashtag.go new file mode 100644 index 0000000..a5332bf --- /dev/null +++ b/workflows/downloadHashtag.go @@ -0,0 +1,36 @@ +package workflows + +import ( + client "../client" + config "../models/config" + utils "../utils" + "fmt" + "strings" +) + +// CanUseDownloadHashtag - Tests if this workflow can be used for the given parameter +func CanUseDownloadHashtag(url string) bool { + match := strings.Contains(url, "/tag/") + return match +} + +// DownloadHashtag - Download videos marked with given hashtag +func DownloadHashtag(url string) { + uploads := 
client.GetHashtagUploads(url) + uploadCount := len(uploads) + hashtag := utils.GetHashtagFromURL(url) + downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, hashtag) + + utils.InitOutputDirectory(downloadDir) + + for index, upload := range uploads { + downloadVideo(upload, downloadDir) + utils.Logf("\r[%d/%d] Downloaded", index+1, uploadCount) + } + utils.Log() +} + +func GetHashtagJson(url string) { + uploads := client.GetHashtagUploads(url) + fmt.Printf("%s", uploads) +} diff --git a/workflows/downloadMusic.go b/workflows/downloadMusic.go index 8fecd37..d6e2c8f 100644 --- a/workflows/downloadMusic.go +++ b/workflows/downloadMusic.go @@ -1,31 +1,36 @@ package workflows import ( - client "../client" - config "../models/config" - utils "../utils" - "fmt" - "regexp" + client "../client" + config "../models/config" + utils "../utils" + "fmt" + "regexp" ) // CanUseDownloadMusic - Check's if DownloadMusic can be used for parameter func CanUseDownloadMusic(url string) bool { - match, _ := regexp.MatchString(".com\\/music\\/.+", url) - return match + match, _ := regexp.MatchString(".com\\/music\\/.+", url) + return match } // DownloadMusic - Download all videos by given music func DownloadMusic(url string) { - uploads := client.GetMusicUploads(url) - uploadCount := len(uploads) + uploads := client.GetMusicUploads(url) + uploadCount := len(uploads) - for index, upload := range uploads { - username := utils.GetUsernameFromString(upload.Uploader) - downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, username) + for index, upload := range uploads { + username := utils.GetUsernameFromString(upload.Uploader) + downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, username) - utils.InitOutputDirectory(downloadDir) - downloadVideo(upload, downloadDir) - utils.Logf("\r[%d/%d] Downloaded", index+1, uploadCount) - } - utils.Log() + utils.InitOutputDirectory(downloadDir) + downloadVideo(upload, downloadDir) + utils.Logf("\r[%d/%d] Downloaded", index+1, 
uploadCount) + } + utils.Log() +} + +func GetMusicJson(url string) { + uploads := client.GetMusicUploadsJson(url) + fmt.Printf("%s", uploads) } diff --git a/workflows/downloadUser.go b/workflows/downloadUser.go index 980921c..cd73373 100644 --- a/workflows/downloadUser.go +++ b/workflows/downloadUser.go @@ -28,3 +28,8 @@ func DownloadUser(username string) { } utils.Log() } + +func GetUserVideosJson(username string) { + uploads := client.GetUserUploadsJson(username) + fmt.Printf("%s", uploads) +} diff --git a/workflows/downloadVideo.go b/workflows/downloadVideo.go index e538db1..990d330 100644 --- a/workflows/downloadVideo.go +++ b/workflows/downloadVideo.go @@ -1,44 +1,44 @@ package workflows import ( - client "../client" - models "../models" - config "../models/config" - utils "../utils" - "fmt" - "regexp" + client "../client" + models "../models" + config "../models/config" + utils "../utils" + "fmt" + "regexp" ) // CanUseDownloadSingleVideo - Check's if DownloadSingleVideo can be used for parameter func CanUseDownloadSingleVideo(url string) bool { - match, _ := regexp.MatchString("\\/@.+\\/video\\/[0-9]+", url) - return match + match, _ := regexp.MatchString("\\/@.+\\/video\\/[0-9]+", url) + return match } // DownloadSingleVideo - Downloads single video func DownloadSingleVideo(url string) { - username := utils.GetUsernameFromString(url) - upload := client.GetVideoDetails(url) - downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, username) + username := utils.GetUsernameFromString(url) + upload := client.GetVideoDetails(url) + downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, username) - utils.InitOutputDirectory(downloadDir) - downloadVideo(upload, downloadDir) - utils.Log("[1/1] Downloaded\n") + utils.InitOutputDirectory(downloadDir) + downloadVideo(upload, downloadDir) + utils.Log("[1/1] Downloaded\n") } // DownloadVideo - Downloads one video func downloadVideo(upload models.Upload, downloadDir string) { - uploadID := 
upload.GetUploadID() - downloadPath := fmt.Sprintf("%s/%s.mp4", downloadDir, uploadID) + uploadID := upload.GetUploadID() + downloadPath := fmt.Sprintf("%s/%s.mp4", downloadDir, uploadID) - if utils.CheckIfExists(downloadPath) { - return - } + if utils.CheckIfExists(downloadPath) { + return + } - utils.DownloadFile(downloadPath, upload.URL) + utils.DownloadFile(downloadPath, upload.URL) - if config.Config.MetaData { - metadataPath := fmt.Sprintf("%s/%s.json", downloadDir, uploadID) - upload.WriteToFile(metadataPath) - } -} + if config.Config.MetaData { + metadataPath := fmt.Sprintf("%s/%s.json", downloadDir, uploadID) + upload.WriteToFile(metadataPath) + } +} \ No newline at end of file diff --git a/workflows/startWorkflowByParameter.go b/workflows/startWorkflowByParameter.go index f3af3df..6e5a6f8 100644 --- a/workflows/startWorkflowByParameter.go +++ b/workflows/startWorkflowByParameter.go @@ -1,6 +1,7 @@ package workflows import ( + config "../models/config" res "../resources" utils "../utils" ) @@ -10,7 +11,11 @@ func StartWorkflowByParameter(url string) { // Music if CanUseDownloadMusic(url) { - DownloadMusic(url) + if config.Config.JSONOnly { + GetMusicJson(url) + } else { + DownloadMusic(url) + } return } @@ -22,7 +27,22 @@ func StartWorkflowByParameter(url string) { // Tiktok user if CanUseDownloadUser(url) { - DownloadUser(utils.GetUsernameFromString(url)) + if config.Config.JSONOnly { + GetUserVideosJson(utils.GetUsernameFromString(url)) + } else { + DownloadUser(utils.GetUsernameFromString(url)) + } + + return + } + + // Tiktok hashtag + if CanUseDownloadHashtag(url) { + if config.Config.JSONOnly { + GetHashtagJson(url) + } else { + DownloadHashtag(url) + } return }