Skip to content

Instantly share code, notes, and snippets.

@harrisoncramer
Last active April 13, 2023 00:58
Show Gist options
  • Save harrisoncramer/43dc163e855e0dc46c394dabe70b756f to your computer and use it in GitHub Desktop.
Save harrisoncramer/43dc163e855e0dc46c394dabe70b756f to your computer and use it in GitHub Desktop.
Image Scraping Script
/*
This is a simple script that uses Google's CSE (Custom Search Engine)
to query for images and store them in a local folder. It expects two
environment variables to be set: GOOGLE_CSE_ID and GOOGLE_API_KEY.
The first is the ID of your custom search engine and the second is
your Google API key. Create a folder called images alongside the
binary and then execute it. It expects a two-column CSV file (search
phrase, then the name used for the saved files) structured as follows:
some cool looking cars,cars
a man being bamboozled,bamboozled
To invoke the binary, run:
./fetch-images your-csv-file.csv
*/
package main
import (
	"encoding/csv"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	neturl "net/url"
	"os"
	"path/filepath"
	"strings"
	"time"
)
// URL is the shape decoded from the "url" object of a search response
// (see CustomSearch.URL). Both fields decode from JSON strings.
type URL struct {
Type string `json:"type"`
Template string `json:"template"`
}
// Request holds the parameters of one query as decoded from the "request"
// and "nextPage" arrays of a search response (see Queries).
//
// Count and StartIndex decode from JSON numbers; every other field,
// including TotalResults, decodes from a JSON string.
type Request struct {
Title string `json:"title"`
// TotalResults is kept as a string, matching how it is declared for decoding.
TotalResults string `json:"totalResults"`
SearchTerms string `json:"searchTerms"`
Count int `json:"count"`
StartIndex int `json:"startIndex"`
InputEncoding string `json:"inputEncoding"`
OutputEncoding string `json:"outputEncoding"`
Safe string `json:"safe"`
Cx string `json:"cx"`
SearchType string `json:"searchType"`
FileType string `json:"fileType"`
Rights string `json:"rights"`
ImgSize string `json:"imgSize"`
ImgType string `json:"imgType"`
ImgColorType string `json:"imgColorType"`
ImgDominantColor string `json:"imgDominantColor"`
}
// Queries is the shape decoded from the "queries" object of a search
// response: the "request" array and the "nextPage" array, both lists of
// Request values.
type Queries struct {
Request []Request `json:"request"`
NextPage []Request `json:"nextPage"`
}
// Image is the metadata decoded from the "image" object nested in each
// Result: a context link, the image's height, width and byte size, and the
// link, height and width of its thumbnail.
//
// NOTE(review): the dimension fields are presumably in pixels; nothing in
// this file establishes the unit.
type Image struct {
ContextLink string `json:"contextLink"`
Height int `json:"height"`
Width int `json:"width"`
ByteSize int `json:"byteSize"`
ThumbnailLink string `json:"thumbnailLink"`
ThumbnailHeight int `json:"thumbnailHeight"`
ThumbnailWidth int `json:"thumbnailWidth"`
}
// Result is one element of the "items" array of a search response
// (see CustomSearch.Items).
//
// main reads only Link, which it uses as the URL to download the image
// from; the remaining fields are decoded but not otherwise used in this
// file.
type Result struct {
Kind string `json:"kind"`
Title string `json:"title"`
HtmlTitle string `json:"htmlTitle"`
// Link is the address main fetches and saves to disk.
Link string `json:"link"`
DisplayLink string `json:"displayLink"`
Snippet string `json:"snippet"`
HtmlSnippet string `json:"htmlSnippet"`
Mime string `json:"mime"`
FileFormat string `json:"fileFormat"`
Image Image `json:"image"`
}
// SearchInformation is the shape decoded from the "searchInformation"
// object of a search response. SearchTime decodes from a JSON number; the
// formatted variants and TotalResults decode from JSON strings.
type SearchInformation struct {
SearchTime float64 `json:"searchTime"`
FormattedSearchTime string `json:"formattedSearchTime"`
TotalResults string `json:"totalResults"`
FormattedTotalResults string `json:"formattedTotalResults"`
}
// Context is the shape decoded from the "context" object of a search
// response; only its title is captured.
type Context struct {
Title string `json:"title"`
}
// CustomSearch is the top-level shape that main unmarshals a search
// response body into. main only consults Items; an empty Items slice is
// reported as "no results".
type CustomSearch struct {
Kind string `json:"kind"`
URL URL `json:"url"`
Queries Queries `json:"queries"`
Context Context `json:"context"`
SearchInformation SearchInformation `json:"searchInformation"`
// Items holds the individual search hits, in the order they were decoded.
Items []Result `json:"items"`
}
// ImageData pairs an image string with a description, with JSON tags
// "image" and "description".
//
// NOTE(review): nothing in the visible part of this file references this
// type; confirm whether it is still needed.
type ImageData struct {
Image string `json:"image"`
Description string `json:"description"`
}
const (
// url is a fmt format string for the search endpoint. Its two %s verbs are
// filled, in order, with the API key and the search engine ID. It
// deliberately ends in "q=" so that main can append the query text
// directly after formatting.
url = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q="
// method is the HTTP method main uses for the search request.
method = "GET"
)
// main is the entry point of the image scraper.
//
// It expects exactly one command-line argument, the path of a CSV file whose
// rows have at least two columns: a search phrase and a label used to name
// the saved files. It also requires the GOOGLE_API_KEY and GOOGLE_CSE_ID
// environment variables. Each row is handed to processRecord; a problem with
// a single row is reported and the next row is processed.
//
// The process exits with status 1 when the argument is missing, when either
// environment variable is empty, when the CSV file cannot be opened, or when
// the CSV reader reports an error other than end of file.
func main() {
	// Upper bound for each complete HTTP exchange (connect, headers and body)
	// so that one unresponsive host cannot stall the whole run.
	const requestTimeout = 60 * time.Second

	// Check if CSV file is provided as argument
	if len(os.Args) != 2 {
		fmt.Println("Usage: ./fetch-images <csv_file>")
		os.Exit(1)
	}

	// Read the credentials once rather than on every row, and fail early:
	// without them every search request would be rejected anyway.
	apiKey := os.Getenv("GOOGLE_API_KEY")
	cseID := os.Getenv("GOOGLE_CSE_ID")
	if apiKey == "" || cseID == "" {
		fmt.Println("GOOGLE_API_KEY and GOOGLE_CSE_ID must both be set")
		os.Exit(1)
	}

	// Open the CSV file
	csvFile, err := os.Open(os.Args[1])
	if err != nil {
		fmt.Println("Error opening CSV file:", err)
		os.Exit(1)
	}
	defer csvFile.Close()

	reader := csv.NewReader(csvFile)
	// One client is shared by the search requests and the image downloads.
	client := &http.Client{Timeout: requestTimeout}

	// Read the CSV file line by line
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Println("Error reading CSV file:", err)
			os.Exit(1)
		}
		processRecord(client, apiKey, cseID, record)
	}
}

// processRecord runs one image search for a CSV record and downloads the
// leading results.
//
// record[0] is the search phrase and record[1] is the label used in the
// saved file names; surrounding whitespace is trimmed from both. The phrase
// is wrapped in double quotes and URL-escaped before being appended to the
// search URL. Every failure is printed and ends processing of this record
// only. Running in its own function means the deferred close of the search
// response happens as soon as the record is done.
func processRecord(client *http.Client, apiKey, cseID string, record []string) {
	// Upper bound on downloads per record; six matches the original loop,
	// which covered result indices 0 through 5.
	const maxImagesPerQuery = 6

	if len(record) < 2 {
		fmt.Println("Skipping CSV record with fewer than two fields:", record)
		return
	}
	query := strings.TrimSpace(record[0])
	label := strings.TrimSpace(record[1])

	// QueryEscape turns spaces into "+" and the quotes into %22, and also
	// protects characters such as '&' or '#' that would otherwise alter the
	// request.
	searchURL := fmt.Sprintf(url, apiKey, cseID) +
		neturl.QueryEscape(`"`+query+`"`) +
		"&searchType=image"
	// NOTE(review): this prints the API key as part of the URL.
	fmt.Println(searchURL)

	req, err := http.NewRequest(method, searchURL, nil)
	if err != nil {
		fmt.Println(err)
		return
	}
	res, err := client.Do(req)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	if res.StatusCode != http.StatusOK {
		fmt.Println("Non-200 status code returned: ", res.StatusCode)
		return
	}

	var searchResult CustomSearch
	if err := json.Unmarshal(body, &searchResult); err != nil {
		fmt.Println(err)
		return
	}
	if len(searchResult.Items) == 0 {
		fmt.Println("No results found for", label)
		return
	}

	for i := 0; i < len(searchResult.Items) && i < maxImagesPerQuery; i++ {
		downloadImage(client, searchResult.Items[i].Link, label, i)
	}
}

// downloadImage fetches imageURL and stores the response body as
// images/<label>_<index>.jpg.
//
// Failures are printed and the function returns without saving. A response
// with a status other than 200 is not written to disk, and a file that
// could not be written completely is removed again. The ".jpg" extension is
// used regardless of the actual image format.
func downloadImage(client *http.Client, imageURL, label string, index int) {
	response, err := client.Get(imageURL)
	if err != nil {
		fmt.Println("Error making API call:", err)
		return
	}
	defer response.Body.Close()

	// Without this check an error page would be saved as if it were an image.
	if response.StatusCode != http.StatusOK {
		fmt.Println("Error making API call: unexpected status", response.Status)
		return
	}

	// Create a file in "images" folder to save the image
	fileName := fmt.Sprintf("%s_%d.jpg", label, index)
	filePath := filepath.Join("images", fileName)
	file, err := os.Create(filePath)
	if err != nil {
		fmt.Println("Error creating file:", err)
		return
	}

	// Copy the image data to the file. Close is checked as well because a
	// write failure may only surface when the file is closed.
	_, err = io.Copy(file, response.Body)
	if closeErr := file.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		fmt.Println("Error saving image:", err)
		// Best-effort cleanup of the partial file; the save error above has
		// already been reported, so a failure to remove is ignored.
		_ = os.Remove(filePath)
		return
	}
	fmt.Printf("Image '%s' downloaded and saved as '%s'\n", label, fileName)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment