Source code for eedl.google_cloud

import os
import re
from pathlib import Path
from typing import List, Union

import requests


from google.cloud import storage  # type: ignore



[docs]
def get_public_export_urls(bucket_name: str, prefix: str = "") -> List[str]:
	"""
	Lists the URLs of items in a *public* Google Cloud Storage Bucket without using a GCloud login.

	Only the bucket listing is requested - the items themselves are not downloaded. The listing is
	filtered to items whose name starts with the specified prefix, and all pages of the listing are
	followed, so buckets with more items than fit in a single response are fully enumerated.

	Args:
		bucket_name (str): Name of the Google Cloud Storage Bucket to pull data from.
		prefix (str): A prefix to use to filter items in the bucket - only URLs where the path matches this prefix will be returned - defaults to all files.

	Returns:
		List[str]: A list of urls.

	Raises:
		requests.HTTPError: If the listing request returns an error status (for example when the bucket doesn't exist or isn't public).
	"""
	from html import unescape  # local import: only needed to decode XML-escaped object names in the listing

	base_url = "https://storage.googleapis.com/"
	request_url = f"{base_url}{bucket_name}/"

	# Comes back as an XML listing - don't need to parse the XML, just need the values of a few elements
	key_pattern = re.compile(r"<Key>(.*?)</Key>")
	next_marker_pattern = re.compile(r"<NextMarker>(.*?)</NextMarker>")

	urls: List[str] = []
	marker = ""
	while True:
		# Pass the prefix as a query parameter so requests URL-encodes it. Filtering server-side also
		# keeps the number of pages down for large buckets.
		params = {"prefix": prefix}
		if marker:
			params["marker"] = marker

		# get the content of the bucket (it needs to be public)
		response = requests.get(request_url, params=params, timeout=60)
		response.raise_for_status()  # an error document has no Key elements and would otherwise look like an empty bucket
		listing = response.text

		# Object names are XML-escaped in the listing (e.g. "&amp;"), so decode them before use
		keys = [unescape(key) for key in key_pattern.findall(listing)]
		# Make them into full URLs with the bucket URL at the front and check that the items have the specified prefix
		urls.extend(f"{request_url}{key}" for key in keys if key.startswith(prefix))

		# Each response holds a limited number of items - keep requesting pages while the listing is flagged as truncated
		if "<IsTruncated>true</IsTruncated>" not in listing or not keys:
			break

		marker_match = next_marker_pattern.search(listing)
		next_marker = unescape(marker_match.group(1)) if marker_match else keys[-1]
		if next_marker == marker:
			break  # no forward progress - stop instead of requesting the same page forever
		marker = next_marker

	return urls




[docs]
def download_public_export(bucket_name: str, output_folder: Union[str, Path], prefix: str = "") -> None:
	"""
	Downloads items from a *public* Google Cloud Storage Bucket into a local folder without using a GCloud login.

	Each item is saved directly inside ``output_folder`` using the last path component of its URL as
	the filename, so any folder structure within the bucket is flattened. The output folder is created
	if it doesn't exist.

	Args:
		bucket_name (str): Name of the Google Cloud Storage Bucket to pull data from.
		output_folder (Union[str, Path]): Destination folder for exported data.
		prefix (str): A prefix to use to filter items in the bucket - only URLs where the path matches this prefix will be returned - defaults to all files.

	Returns:
		None

	Raises:
		requests.HTTPError: If downloading an item returns an error status.
	"""
	# Get the urls of items in the bucket with the specified prefix
	urls = get_public_export_urls(bucket_name, prefix)

	os.makedirs(output_folder, exist_ok=True)

	for url in urls:
		filename = url.split("/")[-1]  # Get the filename
		if not filename:
			# Names ending in "/" are folder placeholders - there is no file to write for them
			continue

		output_path = Path(output_folder) / filename  # Construct the output path

		# Stream the data to disk in chunks so items larger than the available RAM can still be downloaded
		with requests.get(url, stream=True, timeout=60) as response:
			response.raise_for_status()  # don't save an error document as if it were the exported data
			try:
				with open(output_path, "wb") as output_file:
					for chunk in response.iter_content(chunk_size=1024 * 1024):
						output_file.write(chunk)
			except Exception:
				# Don't leave a truncated file behind that could be mistaken for a complete download
				if output_path.exists():
					output_path.unlink()
				raise




[docs]
def download_export(bucket_name: str,
					output_folder: Union[str, Path],
					prefix: str,
					delimiter: str = "/",
					autodelete: bool = True) -> None:
	"""
	Downloads every blob matching a prefix from the specified bucket, optionally deleting each one from the bucket afterward.

	Each blob is saved under ``output_folder`` at a path matching its full name in the bucket,
	so any folders in the blob name are recreated locally.

	Modified from Google Cloud sample documentation at
		https://cloud.google.com/storage/docs/samples/storage-download-file#storage_download_file-python
		and
		https://cloud.google.com/storage/docs/samples/storage-list-files-with-prefix

	Args:
		bucket_name (str): Name of the Google Cloud Storage Bucket to pull data from.
		output_folder (Union[str, Path]): Destination folder for exported data.
		prefix (str): A prefix to use to filter items in the bucket - only blobs whose name starts with this prefix will be downloaded.
		delimiter (str): Delimiter used for getting the list of blobs in the Google Cloud Storage Bucket. Defaults to "/"
		autodelete (bool): Bool for deleting blobs from the bucket once their contents have been downloaded. Defaults to True
	Returns:
		None
	"""
	storage_client = storage.Client()

	bucket = storage_client.bucket(bucket_name)
	blobs = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

	for blob in blobs:
		if not blob.name.startswith(prefix):
			continue
		if blob.name.endswith("/"):
			# Names ending in "/" are folder placeholders - there is no file content to download
			continue

		destination_file_name = os.path.join(output_folder, blob.name)
		# Blob names can include folders (typically the prefix itself), which must exist locally
		# before the download can open the destination file.
		destination_folder = os.path.dirname(destination_file_name)
		if destination_folder:
			os.makedirs(destination_folder, exist_ok=True)

		# Construct a client side representation of a blob.
		# Note `Bucket.blob` differs from `Bucket.get_blob` as it doesn't retrieve
		# any content from Google Cloud Storage. As we don't need additional data,
		# using `Bucket.blob` is preferred here.
		blob_data = bucket.blob(blob.name)
		blob_data.download_to_filename(destination_file_name)
		if autodelete:
			# Only reached when the download above didn't raise, so data isn't removed before it's saved locally
			blob_data.delete()


# print(
# "Downloaded storage object {} from bucket {} to local file {}.".format(
# source_blob_name, bucket_name, destination_file_name
# )
# )