Last active
April 13, 2023 00:58
-
-
Save harrisoncramer/43dc163e855e0dc46c394dabe70b756f to your computer and use it in GitHub Desktop.
Image Scraping Script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
This is a simple script that uses Google's CSE (Custom Search Engine)
to query for images and store them in a local folder. It expects two
environment variables to be set: GOOGLE_CSE_ID and GOOGLE_API_KEY. The
first is the ID of your custom search engine and the second is your
Google API key. Create a folder called images alongside the binary and
then execute it. It expects a CSV file structured as follows, where the
first column is the search phrase and the second column is the name
used for the saved image files:

	some cool looking cars,cars
	a man being bamboozled,bamboozled

To invoke the binary, run:

	./fetch-images your-csv-file.csv
*/
package main | |
import ( | |
"encoding/csv" | |
"encoding/json" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"net/http" | |
"os" | |
"path/filepath" | |
"strings" | |
) | |
// URL holds the "type" and "template" strings decoded from the "url"
// object of a search response.
type URL struct {
	Type     string `json:"type"`
	Template string `json:"template"`
}
// Request is one entry of the "request" or "nextPage" arrays found under
// "queries" in a search response. Count and StartIndex decode from JSON
// numbers; every other field, including TotalResults, decodes from a JSON
// string.
type Request struct {
	Title            string `json:"title"`
	TotalResults     string `json:"totalResults"`
	SearchTerms      string `json:"searchTerms"`
	Count            int    `json:"count"`
	StartIndex       int    `json:"startIndex"`
	InputEncoding    string `json:"inputEncoding"`
	OutputEncoding   string `json:"outputEncoding"`
	Safe             string `json:"safe"`
	Cx               string `json:"cx"`
	SearchType       string `json:"searchType"`
	FileType         string `json:"fileType"`
	Rights           string `json:"rights"`
	ImgSize          string `json:"imgSize"`
	ImgType          string `json:"imgType"`
	ImgColorType     string `json:"imgColorType"`
	ImgDominantColor string `json:"imgDominantColor"`
}
type Queries struct { | |
Request []Request `json:"request"` | |
NextPage []Request `json:"nextPage"` | |
} | |
// Image is the "image" object attached to a search result. It carries the
// context link, the thumbnail link, and the numeric size fields of both the
// image and its thumbnail.
type Image struct {
	ContextLink     string `json:"contextLink"`
	Height          int    `json:"height"`
	Width           int    `json:"width"`
	ByteSize        int    `json:"byteSize"`
	ThumbnailLink   string `json:"thumbnailLink"`
	ThumbnailHeight int    `json:"thumbnailHeight"`
	ThumbnailWidth  int    `json:"thumbnailWidth"`
}
type Result struct { | |
Kind string `json:"kind"` | |
Title string `json:"title"` | |
HtmlTitle string `json:"htmlTitle"` | |
Link string `json:"link"` | |
DisplayLink string `json:"displayLink"` | |
Snippet string `json:"snippet"` | |
HtmlSnippet string `json:"htmlSnippet"` | |
Mime string `json:"mime"` | |
FileFormat string `json:"fileFormat"` | |
Image Image `json:"image"` | |
} | |
// SearchInformation is the "searchInformation" object of a search
// response. SearchTime decodes from a JSON number; the other fields,
// including TotalResults, decode from JSON strings.
type SearchInformation struct {
	SearchTime            float64 `json:"searchTime"`
	FormattedSearchTime   string  `json:"formattedSearchTime"`
	TotalResults          string  `json:"totalResults"`
	FormattedTotalResults string  `json:"formattedTotalResults"`
}
// Context is the "context" object of a search response; only its "title"
// field is decoded.
type Context struct {
	Title string `json:"title"`
}
type CustomSearch struct { | |
Kind string `json:"kind"` | |
URL URL `json:"url"` | |
Queries Queries `json:"queries"` | |
Context Context `json:"context"` | |
SearchInformation SearchInformation `json:"searchInformation"` | |
Items []Result `json:"items"` | |
} | |
// ImageData pairs an "image" string with a "description" string.
// NOTE(review): nothing in the visible code uses this type — confirm
// whether it is still needed.
type ImageData struct {
	Image       string `json:"image"`
	Description string `json:"description"`
}
// url is the fmt template for a Custom Search request. The two %s verbs
// take the API key and the search engine ID, in that order, and the
// template ends with an open "q=" parameter for the caller to complete.
const url = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q="

// method is the HTTP verb used for the search request.
const method = "GET"
func main() { | |
// Check if CSV file is provided as argument | |
if len(os.Args) != 2 { | |
fmt.Println("Usage: ./fetch-images <csv_file>") | |
os.Exit(1) | |
} | |
// Open the CSV file | |
csvFile, err := os.Open(os.Args[1]) | |
if err != nil { | |
fmt.Println("Error opening CSV file:", err) | |
os.Exit(1) | |
} | |
defer csvFile.Close() | |
// Create a CSV reader | |
reader := csv.NewReader(csvFile) | |
client := &http.Client{} | |
// Read the CSV file line by line | |
for { | |
record, err := reader.Read() | |
if err == io.EOF { | |
break | |
} | |
if err != nil { | |
fmt.Println("Error reading CSV file:", err) | |
os.Exit(1) | |
} | |
// Extract the image URL and description from the CSV record | |
imageType := record[0] | |
description := record[1] | |
formattedUrl := fmt.Sprintf(url, os.Getenv("GOOGLE_API_KEY"), os.Getenv("GOOGLE_CSE_ID")) | |
formattedUrl += "%22" | |
query := strings.Replace(imageType, " ", "+", -1) | |
formattedUrl += fmt.Sprintf("%s", query) | |
formattedUrl += "%22" | |
formattedUrl += "&searchType=image" | |
fmt.Println(formattedUrl) | |
req, err := http.NewRequest(method, formattedUrl, nil) | |
if err != nil { | |
fmt.Println(err) | |
continue | |
} | |
res, err := client.Do(req) | |
if err != nil { | |
fmt.Println(err) | |
continue | |
} | |
defer res.Body.Close() | |
body, err := ioutil.ReadAll(res.Body) | |
if err != nil { | |
fmt.Println(err) | |
continue | |
} | |
if res.StatusCode != 200 { | |
fmt.Println("Non-200 status code returned: ", res.StatusCode) | |
continue | |
} | |
var searchResult CustomSearch | |
err = json.Unmarshal(body, &searchResult) | |
if err != nil { | |
fmt.Println(err) | |
continue | |
} | |
if len(searchResult.Items) == 0 { | |
fmt.Println("No results found for", description) | |
continue | |
} | |
for i := 0; i < len(searchResult.Items) && i <= 5; i++ { | |
imageUrl := searchResult.Items[i].Link | |
// Make another API call to download the image | |
response, err := http.Get(imageUrl) | |
if err != nil { | |
fmt.Println("Error making API call:", err) | |
continue | |
} | |
defer response.Body.Close() | |
// Create a file in "images" folder to save the image | |
fileName := fmt.Sprintf("%s_%d.jpg", description, i) | |
filePath := filepath.Join("images", fileName) | |
file, err := os.Create(filePath) | |
if err != nil { | |
fmt.Println("Error creating file:", err) | |
continue | |
} | |
defer file.Close() | |
// Copy the image data to the file | |
_, err = io.Copy(file, response.Body) | |
if err != nil { | |
fmt.Println("Error saving image:", err) | |
continue | |
} | |
fmt.Printf("Image '%s' downloaded and saved as '%s'\n", description, fileName) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment