Source code for eedl.google_cloud

import os
import re
from pathlib import Path
from typing import List, Union

import requests


from google.cloud import storage  # type: ignore


def get_public_export_urls(bucket_name: str, prefix: str = "") -> List[str]:
    """
    Lists the URLs of items in a *public* Google Cloud Storage Bucket without using a GCloud login.
    Filters only to files with the specified prefix. Nothing is downloaded by this function - it
    only reads the bucket listing.

    Args:
        bucket_name (str): Name of the Google Cloud Storage Bucket to pull data from.
        prefix (str): A prefix to use to filter items in the bucket - only URLs where the path matches
            this prefix will be returned - defaults to all files.

    Returns:
        List[str]: A list of urls.

    Raises:
        requests.HTTPError: If the bucket listing request returns an error status (for example,
            when the bucket does not exist or is not public).
    """
    # Local import so this function does not depend on changes to the module's import block.
    from html import unescape

    base_url = "https://storage.googleapis.com/"
    request_url = f"{base_url}{bucket_name}/"

    # The listing comes back as XML - we don't need to parse the XML, just pull out the values of
    # the Key elements, plus the NextMarker element that signals another page of results.
    key_pattern = re.compile(r"<Key>(.*?)</Key>")
    next_marker_pattern = re.compile(r"<NextMarker>(.*?)</NextMarker>")

    urls: List[str] = []
    marker = ""
    while True:
        # Let requests build the query string so the prefix and marker are URL-encoded correctly.
        # Sending the prefix to the server keeps the listing small; the marker continues a
        # listing that was truncated at the per-request item limit.
        params = {}
        if prefix:
            params["prefix"] = prefix
        if marker:
            params["marker"] = marker

        # get the content of the bucket (it needs to be public)
        response = requests.get(request_url, params=params, timeout=60)
        # Fail loudly rather than treating an error document as an empty listing.
        response.raise_for_status()
        listing = response.text

        # Values inside the XML are entity-escaped (e.g. "&amp;"), so unescape them to recover
        # the real object names before building URLs.
        keys = [unescape(key) for key in key_pattern.findall(listing)]

        # Make them into full URLs with the bucket URL at the front and check that they have the prefix
        urls.extend(f"{request_url}{key}" for key in keys if key.startswith(prefix))

        next_marker_match = next_marker_pattern.search(listing)
        if next_marker_match is None or not keys:
            break  # no further pages

        next_marker = unescape(next_marker_match.group(1))
        if next_marker == marker:
            break  # guard against looping forever if the marker fails to advance
        marker = next_marker

    return urls
def download_public_export(bucket_name: str, output_folder: Union[str, Path], prefix: str = "") -> None:
    """
    Downloads items from a *public* Google Cloud Storage Bucket into a local folder without using a
    GCloud login. Each item is saved under the last path component of its URL, so any folder
    structure in the bucket is flattened into ``output_folder``.

    Args:
        bucket_name (str): Name of the Google Cloud Storage Bucket to pull data from.
        output_folder (Union[str, Path]): Destination folder for exported data. Created if it does not exist.
        prefix (str): A prefix to use to filter items in the bucket - only URLs where the path matches
            this prefix will be downloaded - defaults to all files.

    Returns:
        None

    Raises:
        requests.HTTPError: If any item's download request returns an error status.
    """
    # Get the urls of items in the bucket with the specified prefix
    urls = get_public_export_urls(bucket_name, prefix)

    os.makedirs(output_folder, exist_ok=True)

    for url in urls:
        filename = url.split("/")[-1]  # Get the filename
        if not filename:
            # The URL ends in "/" (a folder placeholder object), so there is no file to write.
            continue

        output_path = Path(output_folder) / filename  # Construct the output path

        # Stream the body to disk in chunks so files larger than available RAM can be downloaded.
        with requests.get(url, stream=True, timeout=60) as response:
            # Don't write an error document to disk as though it were the exported data.
            response.raise_for_status()
            with open(output_path, "wb") as output_file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    output_file.write(chunk)
def download_export(bucket_name: str,
                    output_folder: Union[str, Path],
                    prefix: str,
                    delimiter: str = "/",
                    autodelete: bool = True) -> None:
    """
    Downloads blobs matching a prefix from the specified bucket using an authenticated client.

    Modified from Google Cloud sample documentation at
        https://cloud.google.com/storage/docs/samples/storage-download-file#storage_download_file-python
        and
        https://cloud.google.com/storage/docs/samples/storage-list-files-with-prefix

    Each blob is written to ``output_folder`` joined with the blob's full name, so any folder
    components in the blob name become subfolders of ``output_folder``.

    Args:
        bucket_name (str): Name of the Google Cloud Storage Bucket to pull data from.
        output_folder (Union[str, Path]): Destination folder for exported data.
        prefix (str): A prefix to use to filter items in the bucket - only blobs whose name starts
            with this prefix will be downloaded.
        delimiter (str): Delimiter used for getting the list of blobs in the Google Cloud Storage Bucket.
            Defaults to "/"
        autodelete (bool): Whether to delete each blob from the bucket once it has been downloaded.
            Defaults to True

    Returns:
        None
    """
    storage_client = storage.Client()

    bucket = storage_client.bucket(bucket_name)
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

    for blob in blobs:
        if not blob.name.startswith(prefix):
            continue
        if blob.name.endswith("/"):
            # A name ending in "/" is a folder placeholder - there is no file content to save.
            continue

        destination_file_name = os.path.join(output_folder, blob.name)

        # The blob name can include folder components (e.g. from the prefix), so make sure the
        # local parent folder exists before downloading into it.
        destination_folder = os.path.dirname(destination_file_name)
        if destination_folder:
            os.makedirs(destination_folder, exist_ok=True)

        # Construct a client side representation of a blob.
        # Note `Bucket.blob` differs from `Bucket.get_blob` as it doesn't retrieve
        # any content from Google Cloud Storage. As we don't need additional data,
        # using `Bucket.blob` is preferred here.
        blob_data = bucket.blob(blob.name)
        blob_data.download_to_filename(destination_file_name)

        # Only remove the remote copy after the download call above has returned without raising.
        if autodelete:
            blob_data.delete()