Source code for election_text_analysis.download_data

'''
    Tools to download all necessary data from ANES.

    This includes downloading/unzipping an overall timeseries file,
    as well as open-ended files by year.
'''

import urllib.request
import zipfile
import os

def download_all(
    filenames_and_urls=(
        ("timeseries.zip", "https://electionstudies.org/anes_timeseries_cdf_csv_20220916/"),
        ("2020.xlsx", "https://electionstudies.org/anes_timeseries_2020_redactedopenends_excel_20211118/"),
        ("2016.xlsx", "https://electionstudies.org/wp-content/uploads/2016/02/anes_timeseries_2016_redacted_openends.xlsx"),
        ("2016_full.zip", "https://electionstudies.org/anes_timeseries_2016/"),
        ("2012.xlsx", "https://electionstudies.org/anes_timeseries_2012_openends/"),
        ("2008.xls", "https://electionstudies.org/wp-content/uploads/2008/03/anes_timeseries_2008_openends_redacted_Dec2012Revision.xls"),
    ),
    output_dir="downloaded_data",
):
    """
    Downloads all necessary data to an output data directory.

    Each (filename, url) pair is handed to ``download``, which fetches the
    URL and writes it to ``output_dir`` under the given filename. The
    default pairs cover the overall timeseries archive, the full 2016
    archive, and the 2008, 2012, 2016 and 2020 open-ended files.

    Parameters
    ----------
    filenames_and_urls : iterable of (str, str) (optional)
        The (filename, url) pairs to download. The default is an immutable
        tuple so it cannot be altered between calls; a list of tuples is
        accepted just the same.
    output_dir : str (optional, default="downloaded_data")
        The directory to write the downloaded files to. It is created,
        including any missing parent directories, if it does not exist.

    Returns
    -------
    None

    Examples
    --------
    >>> # This will download all files to the default downloaded_data directory
    >>> download_all()

    >>> # This will download all files to a download directory instead
    >>> download_all(output_dir="download")
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    # An empty output_dir means "current directory", so there is nothing
    # to create in that case.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Download each specified URL to the specified filename
    for filename, url in filenames_and_urls:
        download(filename, url, output_dir=output_dir)
def download(filename, url, output_dir="downloaded_data"):
    """
    Downloads a single URL to a given filename.

    If the filename ends in .zip (case-insensitive), the archive is also
    unzipped with its contents placed in the same output_dir.

    Parameters
    ----------
    filename : str
        The name of the file to write to, relative to output_dir. If the
        filename ends in .zip, it will be unzipped with the contents
        placed in the same output_dir
    url : str
        The URL to download the file from
    output_dir : str (optional, default="downloaded_data")
        The directory to write the downloaded file to. It is created,
        including any missing parent directories, if it does not exist.

    Returns
    -------
    None

    Examples
    --------
    >>> # This will download and unzip the timeseries data
    >>> download("timeseries.zip", "https://electionstudies.org/anes_timeseries_cdf_csv_20220916/")

    >>> # This will download the 2020 open-ended data
    >>> download("2020.xlsx", "https://electionstudies.org/anes_timeseries_2020_redactedopenends_excel_20211118/")
    """
    print("Downloading", filename)

    # Make sure the destination exists before spending time on the transfer,
    # so a direct call does not fail only after the download has finished.
    # An empty output_dir means "current directory": nothing to create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Send a browser-like User-Agent; the server otherwise answers with
    # 403 Forbidden.
    # From https://stackoverflow.com/questions/38489386/how-to-fix-403-forbidden-errors-when-calling-apis-using-python-requests
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    request = urllib.request.Request(url, None, headers)

    # Read the response data from our request; the context manager closes
    # the underlying connection even if the read raises.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    # Write the data to a local file
    download_path = os.path.join(output_dir, filename)
    with open(download_path, "wb") as outputfile:
        outputfile.write(data)

    # If the output file is a .zip archive, unpack it to the output_dir directory
    if filename.lower().endswith(".zip"):
        # From https://stackoverflow.com/questions/3451111/unzipping-files-in-python
        with zipfile.ZipFile(download_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)