Download videos by hashtag; get json data without video downloading; limit option

This commit is contained in:
alexpin 2020-02-25 00:56:19 +02:00
parent 1b3f985f42
commit f724f0f2a2
13 changed files with 308 additions and 165 deletions

View File

@ -22,6 +22,8 @@ Clone this repository and run `go build` to build the executable.
* `-batch-file` - File containing URLs/Usernames to download, one value per line. Lines starting with '#', are considered as comments and ignored. * `-batch-file` - File containing URLs/Usernames to download, one value per line. Lines starting with '#', are considered as comments and ignored.
* `-deadline` - Sets the timeout for scraper logic in seconds (used as a workaround for context deadline exceeded error) (default 1500) * `-deadline` - Sets the timeout for scraper logic in seconds (used as a workaround for context deadline exceeded error) (default 1500)
* `-quiet` - Suppress output * `-quiet` - Suppress output
* `-json` - Prints all data scraped from TikTok as JSON, without downloading any videos
* `-limit` - Sets the maximum number of videos to download (default: no limit)
## Acknowledgments ## Acknowledgments
This software uses the **chromedp** for web scraping, it can be found here: https://github.com/chromedp/chromedp \ This software uses the **chromedp** for web scraping, it can be found here: https://github.com/chromedp/chromedp \

View File

@ -1,101 +1,102 @@
package client package client
import ( import (
"context" "context"
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
"io/ioutil" "io/ioutil"
"log" "log"
"os" "os"
"time" "time"
config "../models/config" config "../models/config"
utils "../utils" utils "../utils"
) )
// GetMusicUploads - Get all uploads by given music // GetMusicUploads - Get all uploads by given music
func executeClientAction(url string, jsAction string) string { func executeClientAction(url string, jsAction string) string {
dir, err := ioutil.TempDir("", "chromedp-example") dir, err := ioutil.TempDir("", "chromedp-example")
utils.CheckErr(err) utils.CheckErr(err)
defer os.RemoveAll(dir) defer os.RemoveAll(dir)
opts := append(chromedp.DefaultExecAllocatorOptions[:], opts := append(chromedp.DefaultExecAllocatorOptions[:],
chromedp.DisableGPU, chromedp.DisableGPU,
chromedp.UserDataDir(dir), chromedp.UserDataDir(dir),
chromedp.Flag("headless", !config.Config.Debug), chromedp.Flag("headless", !config.Config.Debug),
) )
allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
defer cancel() defer cancel()
ctx, cancel := chromedp.NewContext( ctx, cancel := chromedp.NewContext(
allocCtx, allocCtx,
chromedp.WithLogf(log.Printf), chromedp.WithLogf(log.Printf),
) )
defer cancel() defer cancel()
ctx, cancel = context.WithTimeout(ctx, time.Duration(config.Config.Deadline)*time.Second) ctx, cancel = context.WithTimeout(ctx, time.Duration(config.Config.Deadline)*time.Second)
defer cancel() defer cancel()
var jsOutput string var jsOutput string
jsOutput = runScrapeWithInfo(ctx, jsAction, url) jsOutput = runScrapeWithInfo(ctx, jsAction, url)
return jsOutput return jsOutput
} }
func runScrapeQuiet(ctx context.Context, jsAction string, url string) string { func runScrapeQuiet(ctx context.Context, jsAction string, url string) string {
var jsOutput string var jsOutput string
err := chromedp.Run(ctx, err := chromedp.Run(ctx,
// Navigate to user's page // Navigate to user's page
chromedp.Navigate(url), chromedp.Navigate(url),
// Execute url grabber script // Execute url grabber script
chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput),
chromedp.EvaluateAsDevTools(jsAction, &jsOutput), chromedp.EvaluateAsDevTools(jsAction, &jsOutput),
// Wait until custom js finishes // Wait until custom js finishes
chromedp.WaitVisible(`video_urls`), chromedp.WaitVisible(`video_urls`),
// Grab url links from our element // Grab url links from our element
chromedp.InnerHTML(`video_urls`, &jsOutput), chromedp.InnerHTML(`video_urls`, &jsOutput),
) )
utils.CheckErr(err) utils.CheckErr(err)
return jsOutput return jsOutput
} }
func runScrapeWithInfo(ctx context.Context, jsAction string, url string) string { func runScrapeWithInfo(ctx context.Context, jsAction string, url string) string {
var jsOutput string var jsOutput string
err := chromedp.Run(ctx, err := chromedp.Run(ctx,
// Navigate to user's page // Navigate to user's page
chromedp.Navigate(url), chromedp.Navigate(url),
// Execute url grabber script // Execute url grabber script
chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput), chromedp.WaitReady("video"),
chromedp.EvaluateAsDevTools(jsAction, &jsOutput), chromedp.EvaluateAsDevTools(utils.ReadFileAsString("scraper.js"), &jsOutput),
) chromedp.EvaluateAsDevTools(jsAction, &jsOutput),
utils.CheckErr(err) )
utils.CheckErr(err)
for { for {
err = chromedp.Run(ctx, chromedp.EvaluateAsDevTools("currentState.preloadCount.toString()", &jsOutput)) err = chromedp.Run(ctx, chromedp.EvaluateAsDevTools("currentState.preloadCount.toString()", &jsOutput))
utils.CheckErr(err) utils.CheckErr(err)
if jsOutput != "0" { if jsOutput != "0" {
utils.Logf("\rPreloading... Currently loaded %s items.", jsOutput) utils.Logf("\rPreloading... %s items have been founded.", jsOutput)
} else { } else {
utils.Logf("\rPreloading...") utils.Logf("\rPreloading...")
} }
err = chromedp.Run(ctx, chromedp.EvaluateAsDevTools("currentState.finished.toString()", &jsOutput)) err = chromedp.Run(ctx, chromedp.EvaluateAsDevTools("currentState.finished.toString()", &jsOutput))
utils.CheckErr(err) utils.CheckErr(err)
if jsOutput == "true" { if jsOutput == "true" {
break break
} }
time.Sleep(50 * time.Millisecond) time.Sleep(50 * time.Millisecond)
} }
utils.Log("\nRetrieving items...") utils.Log("\nRetrieving items...")
err = chromedp.Run(ctx, err = chromedp.Run(ctx,
// Wait until custom js finishes // Wait until custom js finishes
chromedp.WaitVisible(`video_urls`), chromedp.WaitVisible(`video_urls`),
// Grab url links from our element // Grab url links from our element
chromedp.InnerHTML(`video_urls`, &jsOutput), chromedp.InnerHTML(`video_urls`, &jsOutput),
) )
utils.CheckErr(err) utils.CheckErr(err)
return jsOutput return jsOutput
} }

View File

@ -0,0 +1,19 @@
package client
import (
models "../models"
config "../models/config"
"fmt"
)
// GetHashtagUploads - Get all uploads marked with given hashtag
func GetHashtagUploads(hashtagURL string) []models.Upload {
	// The configured video limit is passed to the scraper script as its only argument.
	scraperCall := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit)
	return models.ParseUploads(executeClientAction(hashtagURL, scraperCall))
}
func GetHashtagUploadsJson(hashtagURL string) string {
jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit)
return executeClientAction(hashtagURL, jsMethod)
}

View File

@ -1,11 +1,19 @@
package client package client
import ( import (
models "../models" models "../models"
config "../models/config"
"fmt"
) )
// GetMusicUploads - Get all uploads by given music // GetMusicUploads - Get all uploads by given music
func GetMusicUploads(url string) []models.Upload { func GetMusicUploads(url string) []models.Upload {
actionOutput := executeClientAction(url, "bootstrapIteratingVideos()") jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit)
return models.ParseUploads(actionOutput) actionOutput := executeClientAction(url, jsMethod)
return models.ParseUploads(actionOutput)
}
func GetMusicUploadsJson(url string) string {
jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit)
return executeClientAction(url, jsMethod)
} }

View File

@ -2,10 +2,18 @@ package client
import ( import (
models "../models" models "../models"
config "../models/config"
"fmt"
) )
// GetUserUploads - Get all uploads by user // GetUserUploads - Get all uploads by user
func GetUserUploads(username string) []models.Upload { func GetUserUploads(username string) []models.Upload {
actionOutput := executeClientAction(`https://www.tiktok.com/@`+username, "bootstrapIteratingVideos()") jsMethod := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit)
actionOutput := executeClientAction(`https://www.tiktok.com/@`+username, jsMethod)
return models.ParseUploads(actionOutput) return models.ParseUploads(actionOutput)
} }
// GetUserUploadsJson - Get the raw scraper output for given user's page, without parsing it into uploads
func GetUserUploadsJson(username string) string {
	profileURL := `https://www.tiktok.com/@` + username
	// The configured video limit is passed to the scraper script as its only argument.
	scraperCall := fmt.Sprintf("bootstrapIteratingVideos(%d)", config.Config.Limit)
	return executeClientAction(profileURL, scraperCall)
}

View File

@ -1,48 +1,57 @@
package config package config
import ( import (
"flag" "flag"
"fmt" "fmt"
"os" "os"
) )
// Config - Runtime configuration // Config - Runtime configuration
var Config struct { var Config struct {
URL string URL string
OutputPath string OutputPath string
BatchFilePath string BatchFilePath string
Debug bool Debug bool
MetaData bool MetaData bool
Quiet bool Quiet bool
Deadline int Deadline int
Limit int
JSONOnly bool
} }
// GetConfig - Returns Config object // GetConfig - Returns Config object
func GetConfig() { func GetConfig() {
outputPath := flag.String("output", "./downloads", "Output path") outputPath := flag.String("output", "./downloads", "Output path")
batchFilePath := flag.String("batch-file", "", "File containing URLs/Usernames to download, one value per line. Lines starting with '#', are considered as comments and ignored.") batchFilePath := flag.String("batch-file", "", "File containing URLs/Usernames to download, one value per line. Lines starting with '#', are considered as comments and ignored.")
debug := flag.Bool("debug", false, "Enables debug mode") debug := flag.Bool("debug", false, "Enables debug mode")
metadata := flag.Bool("metadata", false, "Write video metadata to a .json file") metadata := flag.Bool("metadata", false, "Write video metadata to a .json file")
quiet := flag.Bool("quiet", false, "Supress output") quiet := flag.Bool("quiet", false, "Supress output")
deadline := flag.Int("deadline", 1500, "Sets the timout for scraper logic in seconds (used as a workaround for 'context deadline exceeded' error)") deadline := flag.Int("deadline", 1500, "Sets the timout for scraper logic in seconds (used as a workaround for 'context deadline exceeded' error)")
flag.Parse() limit := flag.Int("limit", 0, "Sets the videos count limit (useful when there too many videos from the user or by hashtag)")
jsonOnly := flag.Bool("json", false, "Just get JSON data from scraper (without video downloading)")
flag.Parse()
args := flag.Args() args := flag.Args()
if len(args) < 1 && *batchFilePath == "" { if len(args) < 1 && *batchFilePath == "" {
fmt.Println("Usage: tiktok-dl [OPTIONS] TIKTOK_USERNAME|TIKTOK_URL") fmt.Println("Usage: tiktok-dl [OPTIONS] TIKTOK_USERNAME|TIKTOK_URL")
fmt.Println(" or: tiktok-dl [OPTIONS] -batch-file path/to/users.txt") fmt.Println(" or: tiktok-dl [OPTIONS] -batch-file path/to/users.txt")
os.Exit(2) os.Exit(2)
} }
if len(args) > 0 { if len(args) > 0 {
Config.URL = flag.Args()[len(args)-1] Config.URL = flag.Args()[len(args)-1]
} else { } else {
Config.URL = "" Config.URL = ""
} }
Config.OutputPath = *outputPath Config.OutputPath = *outputPath
Config.BatchFilePath = *batchFilePath Config.BatchFilePath = *batchFilePath
Config.Debug = *debug Config.Debug = *debug
Config.MetaData = *metadata Config.MetaData = *metadata
Config.Quiet = *quiet Config.Quiet = *quiet
Config.Deadline = *deadline if *jsonOnly {
Config.Quiet = true
}
Config.Deadline = *deadline
Config.Limit = *limit
Config.JSONOnly = *jsonOnly;
} }

View File

@ -1,7 +1,7 @@
optStrings = { optStrings = {
selectors: { selectors: {
feedLoading: 'div.tiktok-loading.feed-loading', feedLoading: 'div.tiktok-loading.feed-loading',
modalArrowLeft: 'div.video-card-modal > div > img.arrow-right', modalArrowRight: 'div.video-card-modal > div > img.arrow-right',
modalClose: '.video-card-modal > div > div.close', modalClose: '.video-card-modal > div > div.close',
modalPlayer: 'div > div > main > div.video-card-modal > div > div.video-card-big > div.video-card-container > div > div > video', modalPlayer: 'div > div > main > div.video-card-modal > div > div.video-card-big > div.video-card-container > div > div > video',
modalShareInput: '.copy-link-container > input', modalShareInput: '.copy-link-container > input',
@ -30,6 +30,7 @@ optStrings = {
currentState = { currentState = {
preloadCount: 0, preloadCount: 0,
finished: false, finished: false,
limit: 100
}; };
createVidUrlElement = function(outputObj) { createVidUrlElement = function(outputObj) {
@ -37,7 +38,7 @@ createVidUrlElement = function(outputObj) {
urlSetElement.innerText = JSON.stringify(outputObj); urlSetElement.innerText = JSON.stringify(outputObj);
document.getElementsByTagName(optStrings.tags.resultParentTag)[0].appendChild(urlSetElement); document.getElementsByTagName(optStrings.tags.resultParentTag)[0].appendChild(urlSetElement);
currentState.finished = true; currentState.finished = true;
} };
buldVidUrlArray = function(finishCallback) { buldVidUrlArray = function(finishCallback) {
var feedItem = document.getElementsByClassName(optStrings.classes.feedVideoItem)[0]; var feedItem = document.getElementsByClassName(optStrings.classes.feedVideoItem)[0];
@ -46,8 +47,14 @@ buldVidUrlArray = function(finishCallback) {
var videoArray = []; var videoArray = [];
var intervalID = window.setInterval(x => { var intervalID = window.setInterval(x => {
videoArray.push(getCurrentModalVideo()); videoArray.push(getCurrentModalVideo());
if(currentState.limit > 0) {
var arrowRight = document.querySelectorAll(optStrings.selectors.modalArrowLeft)[0]; if (videoArray.length >= currentState.limit) {
window.clearInterval(intervalID);
document.querySelector(optStrings.selectors.modalClose).click();
finishCallback(videoArray);
}
}
var arrowRight = document.querySelectorAll(optStrings.selectors.modalArrowRight)[0];
if (arrowRight.classList.contains(optStrings.classes.modalCloseDisabled)) { if (arrowRight.classList.contains(optStrings.classes.modalCloseDisabled)) {
window.clearInterval(intervalID); window.clearInterval(intervalID);
document.querySelector(optStrings.selectors.modalClose).click(); document.querySelector(optStrings.selectors.modalClose).click();
@ -78,7 +85,7 @@ getCurrentModalVideo = function() {
link: soundHref, link: soundHref,
}, },
}; };
} };
getCurrentVideo = function() { getCurrentVideo = function() {
var player = document.querySelector(optStrings.selectors.videoPlayer); var player = document.querySelector(optStrings.selectors.videoPlayer);
@ -100,13 +107,19 @@ getCurrentVideo = function() {
link: soundHref, link: soundHref,
}, },
}; };
} };
scrollWhileNew = function(finishCallback) { scrollWhileNew = function(finishCallback) {
var state = { count: 0 }; var state = { count: 0 };
var intervalID = window.setInterval(x => { var intervalID = window.setInterval(x => {
var oldCount = state.count; var oldCount = state.count;
state.count = document.getElementsByClassName(optStrings.classes.feedVideoItem).length; state.count = document.getElementsByClassName(optStrings.classes.feedVideoItem).length;
if(currentState.limit > 0) {
if (currentState.preloadCount >= currentState.limit || state.count >= currentState.limit) {
finishCallback(createVidUrlElement);
window.clearInterval(intervalID);
}
}
if (oldCount !== state.count) { if (oldCount !== state.count) {
currentState.preloadCount = state.count; currentState.preloadCount = state.count;
window.scrollTo(0, document.body.scrollHeight); window.scrollTo(0, document.body.scrollHeight);
@ -121,7 +134,8 @@ scrollWhileNew = function(finishCallback) {
}, 1000); }, 1000);
}; };
bootstrapIteratingVideos = function() { bootstrapIteratingVideos = function(limit) {
currentState.limit = limit;
scrollWhileNew(buldVidUrlArray); scrollWhileNew(buldVidUrlArray);
return 'bootstrapIteratingVideos'; return 'bootstrapIteratingVideos';
}; };
@ -130,7 +144,7 @@ bootstrapGetCurrentVideo = function() {
var video = getCurrentVideo(); var video = getCurrentVideo();
createVidUrlElement(video); createVidUrlElement(video);
return 'bootstrapGetCurrentVideo'; return 'bootstrapGetCurrentVideo';
} };
init = () => { init = () => {
const newProto = navigator.__proto__; const newProto = navigator.__proto__;

16
utils/getHashtag.go Normal file
View File

@ -0,0 +1,16 @@
package utils
import (
res "../resources"
"fmt"
"strings"
)
// GetHashtagFromURL - Gets tag name from passed url
//
// Only the path segment directly following "/tag/" is returned: everything
// from the next '/', '?' or '#' onwards (trailing slash, query string,
// fragment) is dropped, so ".../tag/foo?lang=en" yields "foo".
// Panics with res.ErrorCouldNotRecogniseURL when str contains no "/tag/".
func GetHashtagFromURL(str string) string {
	const marker = "/tag/"

	start := strings.Index(str, marker)
	if start < 0 {
		panic(fmt.Sprintf(res.ErrorCouldNotRecogniseURL, str))
	}

	tag := str[start+len(marker):]
	// The result is used as a directory name by callers, so it must not carry
	// URL decorations such as "?lang=en".
	if end := strings.IndexAny(tag, "/?#"); end >= 0 {
		tag = tag[:end]
	}
	return tag
}

View File

@ -0,0 +1,36 @@
package workflows
import (
client "../client"
config "../models/config"
utils "../utils"
"fmt"
"strings"
)
// CanUseDownloadHashtag - Reports whether the hashtag workflow can be used for parameter, i.e. whether url contains "/tag/"
func CanUseDownloadHashtag(url string) bool {
	return strings.Contains(url, "/tag/")
}
// DownloadHashtag - Download videos marked with given hashtag
func DownloadHashtag(url string) {
	videos := client.GetHashtagUploads(url)
	total := len(videos)

	// All videos of one hashtag go into a single directory named after the tag.
	targetDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, utils.GetHashtagFromURL(url))
	utils.InitOutputDirectory(targetDir)

	for i := range videos {
		downloadVideo(videos[i], targetDir)
		utils.Logf("\r[%d/%d] Downloaded", i+1, total)
	}
	utils.Log()
}
// GetHashtagJson - Print the raw JSON scraped for given hashtag url to stdout, without downloading any video
func GetHashtagJson(url string) {
	// Use the raw-output client call: GetHashtagUploads returns parsed
	// []models.Upload values, which "%s" cannot render as JSON.
	uploads := client.GetHashtagUploadsJson(url)
	fmt.Printf("%s", uploads)
}

View File

@ -1,31 +1,36 @@
package workflows package workflows
import ( import (
client "../client" client "../client"
config "../models/config" config "../models/config"
utils "../utils" utils "../utils"
"fmt" "fmt"
"regexp" "regexp"
) )
// CanUseDownloadMusic - Check's if DownloadMusic can be used for parameter // CanUseDownloadMusic - Check's if DownloadMusic can be used for parameter
func CanUseDownloadMusic(url string) bool { func CanUseDownloadMusic(url string) bool {
match, _ := regexp.MatchString(".com\\/music\\/.+", url) match, _ := regexp.MatchString(".com\\/music\\/.+", url)
return match return match
} }
// DownloadMusic - Download all videos by given music // DownloadMusic - Download all videos by given music
func DownloadMusic(url string) { func DownloadMusic(url string) {
uploads := client.GetMusicUploads(url) uploads := client.GetMusicUploads(url)
uploadCount := len(uploads) uploadCount := len(uploads)
for index, upload := range uploads { for index, upload := range uploads {
username := utils.GetUsernameFromString(upload.Uploader) username := utils.GetUsernameFromString(upload.Uploader)
downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, username) downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, username)
utils.InitOutputDirectory(downloadDir) utils.InitOutputDirectory(downloadDir)
downloadVideo(upload, downloadDir) downloadVideo(upload, downloadDir)
utils.Logf("\r[%d/%d] Downloaded", index+1, uploadCount) utils.Logf("\r[%d/%d] Downloaded", index+1, uploadCount)
} }
utils.Log() utils.Log()
}
func GetMusicJson(url string) {
uploads := client.GetMusicUploadsJson(url)
fmt.Printf("%s", uploads)
} }

View File

@ -28,3 +28,8 @@ func DownloadUser(username string) {
} }
utils.Log() utils.Log()
} }
// GetUserVideosJson - Print the raw scraper output for given user's uploads to stdout
func GetUserVideosJson(username string) {
	fmt.Printf("%s", client.GetUserUploadsJson(username))
}

View File

@ -1,44 +1,44 @@
package workflows package workflows
import ( import (
client "../client" client "../client"
models "../models" models "../models"
config "../models/config" config "../models/config"
utils "../utils" utils "../utils"
"fmt" "fmt"
"regexp" "regexp"
) )
// CanUseDownloadSingleVideo - Check's if DownloadSingleVideo can be used for parameter // CanUseDownloadSingleVideo - Check's if DownloadSingleVideo can be used for parameter
func CanUseDownloadSingleVideo(url string) bool { func CanUseDownloadSingleVideo(url string) bool {
match, _ := regexp.MatchString("\\/@.+\\/video\\/[0-9]+", url) match, _ := regexp.MatchString("\\/@.+\\/video\\/[0-9]+", url)
return match return match
} }
// DownloadSingleVideo - Downloads single video // DownloadSingleVideo - Downloads single video
func DownloadSingleVideo(url string) { func DownloadSingleVideo(url string) {
username := utils.GetUsernameFromString(url) username := utils.GetUsernameFromString(url)
upload := client.GetVideoDetails(url) upload := client.GetVideoDetails(url)
downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, username) downloadDir := fmt.Sprintf("%s/%s", config.Config.OutputPath, username)
utils.InitOutputDirectory(downloadDir) utils.InitOutputDirectory(downloadDir)
downloadVideo(upload, downloadDir) downloadVideo(upload, downloadDir)
utils.Log("[1/1] Downloaded\n") utils.Log("[1/1] Downloaded\n")
} }
// DownloadVideo - Downloads one video // DownloadVideo - Downloads one video
func downloadVideo(upload models.Upload, downloadDir string) { func downloadVideo(upload models.Upload, downloadDir string) {
uploadID := upload.GetUploadID() uploadID := upload.GetUploadID()
downloadPath := fmt.Sprintf("%s/%s.mp4", downloadDir, uploadID) downloadPath := fmt.Sprintf("%s/%s.mp4", downloadDir, uploadID)
if utils.CheckIfExists(downloadPath) { if utils.CheckIfExists(downloadPath) {
return return
} }
utils.DownloadFile(downloadPath, upload.URL) utils.DownloadFile(downloadPath, upload.URL)
if config.Config.MetaData { if config.Config.MetaData {
metadataPath := fmt.Sprintf("%s/%s.json", downloadDir, uploadID) metadataPath := fmt.Sprintf("%s/%s.json", downloadDir, uploadID)
upload.WriteToFile(metadataPath) upload.WriteToFile(metadataPath)
} }
} }

View File

@ -1,6 +1,7 @@
package workflows package workflows
import ( import (
config "../models/config"
res "../resources" res "../resources"
utils "../utils" utils "../utils"
) )
@ -10,7 +11,11 @@ func StartWorkflowByParameter(url string) {
// Music // Music
if CanUseDownloadMusic(url) { if CanUseDownloadMusic(url) {
DownloadMusic(url) if config.Config.JSONOnly {
GetMusicJson(url)
} else {
DownloadMusic(url)
}
return return
} }
@ -22,7 +27,22 @@ func StartWorkflowByParameter(url string) {
// Tiktok user // Tiktok user
if CanUseDownloadUser(url) { if CanUseDownloadUser(url) {
DownloadUser(utils.GetUsernameFromString(url)) if config.Config.JSONOnly {
GetUserVideosJson(utils.GetUsernameFromString(url))
} else {
DownloadUser(utils.GetUsernameFromString(url))
}
return
}
// Tiktok hashtag
if CanUseDownloadHashtag(url) {
if config.Config.JSONOnly {
GetHashtagJson(url)
} else {
DownloadHashtag(url)
}
return return
} }