Module pycurious.download

This module provides some simple functions to download or install files.

It is good practise to store a checksum for each file. A checksum is a unique signature used to identify a file, and ensure that the data contained within the file is legitimate. Here is a sample workflow for downloading EMAG2 (v3) and evaluating its checksum:

resource = {
     "local_file":"../../data/EMAG2_V3_20170530.npz",
     "md5":'c0898b6a91efb3f13783873a8b67380c',
     "url":"https://zenodo.org/record/3245551/files/EMAG2_V3_20170530.npz?download=1",
     "expected_size":"500Mb"
    }

download_cached_file(resource["url"], resource["local_file"], resource["md5"], resource["expected_size"])

The file will commence downloading if it does not already exist in the local directory or if the checksum (md5) does not match. If you do not know the checksum for a file, run

md5sum(filename)

to return its unique identifier.

Source code
# Copyright 2018-2019 Ben Mather, Robert Delhaye
#
# This file is part of PyCurious.
#
# PyCurious is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or any later version.
#
# PyCurious is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with PyCurious.  If not, see <http://www.gnu.org/licenses/>.

"""
This module provides some simple functions to download or install files.

It is good practise to store a checksum for each file. A checksum is a
unique signature used to identify a file, and ensure that the data
contained within the file is legitimate. Here is a sample workflow for
downloading EMAG2 (v3) and evaluating its checksum:

```python
resource = {
     "local_file":"../../data/EMAG2_V3_20170530.npz",
     "md5":'c0898b6a91efb3f13783873a8b67380c',
     "url":"https://zenodo.org/record/3245551/files/EMAG2_V3_20170530.npz?download=1",
     "expected_size":"500Mb"
    }

download_cached_file(resource["url"], resource["local_file"], resource["md5"], resource["expected_size"])
```

The file will commence downloading if it does not already exist in the local directory
or if the checksum (md5) does not match. If you do not know the checksum for a file, run

```python
md5sum(filename)
```

to return its unique identifier.
"""


def download_file(url, local_filename, expected_size="Unknown"):
    """
    Download files from a URL to a local filename.

    Args:
        url : str
            URL that points to the file to be downloaded
        local_filename : str
            download content to this filename
    """
    import requests
    import os
    import time
    import shutil

    # We might want to bundle some files if they are small / compressed or not readily available for download

    if os.path.isfile(url):
        shutil.copy(url, local_filename)

    else:
        r = requests.get(url, stream=True)

        start_time = time.time()
        last_time = start_time
        datasize = 0

        with open(local_filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=10000000):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
                    datasize += len(chunk)
                    if (time.time() - last_time) > 2.5:
                        print(
                            "{:.1f} Mb in {:.1f}s / {}".format(
                                datasize / 1.0e6,
                                time.time() - start_time,
                                expected_size,
                            )
                        )
                        last_time = time.time()

    return


def md5sum(filename):
    """
    Returns the checksum for a given file
    """
    import hashlib
    from functools import partial

    with open(filename, mode="rb") as f:
        d = hashlib.md5()
        for buf in iter(partial(f.read, 4096), b""):
            d.update(buf)
    return d.hexdigest()


def download_cached_file(
    location_url, local_file, expected_md5, expected_size="Unknown"
):
    """
    Download files from a URL to a local filename.

    Same as `download_file()` but checks whether the file has already
    been downloaded from its checksum.

    Args:
        location_url : str
            URL that points to the file to be downloaded
        local_file : str
            download content to this filename
        expected_md5 : str
            checksum belonging to that file
        expected_size : float / int / str (optional)
            optional size of file (default="Unknown")
    """
    import sys

    try:
        assert md5sum(local_file) == expected_md5
        print("Using cached file - {}".format(local_file))
        return 2

    except (IOError, AssertionError) as error_info:
        # No file or the wrong file ... best go download it
        # print "Assertion failed - {}".format(sys.exc_info())

        try:
            data_file = download_file(location_url, local_file, expected_size)
            print("Downloaded from {}".format(location_url))
            return 1

        except:
            print("Unable to download {} [{}] ".format(location_url, sys.exc_info()))
            return 0


def report_cached_file(local_file, expected_md5):
    """
    Report whether the local file matches its checksum

    Args:
        local_file : str
            string pointing to the local file
        expected_md5 : str
            checksum assigned to `local_file`.
    """
    import os
    import os.path

    if not os.path.isfile(local_file):
        print("Local file {} does not exist".format(local_file))
    else:
        if len(expected_md5) == 0 or md5sum(local_file) == expected_md5:
            print("Cached file {} is valid".format(local_file))
        else:
            print(
                "Cached file {} failed, checksum {}".format(
                    local_file, md5sum(local_file)
                )
            )

Functions

def download_cached_file(location_url, local_file, expected_md5, expected_size='Unknown')

Download files from a URL to a local filename.

Same as download_file() but checks whether the file has already been downloaded from its checksum.

Args

location_url : str
URL that points to the file to be downloaded
local_file : str
download content to this filename
expected_md5 : str
checksum belonging to that file
expected_size : float / int / str (optional)
optional size of file (default="Unknown")
Source code
def download_cached_file(
    location_url, local_file, expected_md5, expected_size="Unknown"
):
    """
    Download files from a URL to a local filename.

    Same as `download_file()` but checks whether the file has already
    been downloaded from its checksum.

    Args:
        location_url : str
            URL that points to the file to be downloaded
        local_file : str
            download content to this filename
        expected_md5 : str
            checksum belonging to that file
        expected_size : float / int / str (optional)
            optional size of file (default="Unknown")
    """
    import sys

    try:
        assert md5sum(local_file) == expected_md5
        print("Using cached file - {}".format(local_file))
        return 2

    except (IOError, AssertionError) as error_info:
        # No file or the wrong file ... best go download it
        # print "Assertion failed - {}".format(sys.exc_info())

        try:
            data_file = download_file(location_url, local_file, expected_size)
            print("Downloaded from {}".format(location_url))
            return 1

        except:
            print("Unable to download {} [{}] ".format(location_url, sys.exc_info()))
            return 0
def download_file(url, local_filename, expected_size='Unknown')

Download files from a URL to a local filename.

Args

url : str
URL that points to the file to be downloaded
local_filename : str
download content to this filename
Source code
def download_file(url, local_filename, expected_size="Unknown"):
    """
    Download files from a URL to a local filename.

    Args:
        url : str
            URL that points to the file to be downloaded
        local_filename : str
            download content to this filename
    """
    import requests
    import os
    import time
    import shutil

    # We might want to bundle some files if they are small / compressed or not readily available for download

    if os.path.isfile(url):
        shutil.copy(url, local_filename)

    else:
        r = requests.get(url, stream=True)

        start_time = time.time()
        last_time = start_time
        datasize = 0

        with open(local_filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=10000000):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
                    datasize += len(chunk)
                    if (time.time() - last_time) > 2.5:
                        print(
                            "{:.1f} Mb in {:.1f}s / {}".format(
                                datasize / 1.0e6,
                                time.time() - start_time,
                                expected_size,
                            )
                        )
                        last_time = time.time()

    return
def md5sum(filename)

Returns the checksum for a given file

Source code
def md5sum(filename):
    """
    Returns the checksum for a given file
    """
    import hashlib
    from functools import partial

    with open(filename, mode="rb") as f:
        d = hashlib.md5()
        for buf in iter(partial(f.read, 4096), b""):
            d.update(buf)
    return d.hexdigest()
def report_cached_file(local_file, expected_md5)

Report whether the local file matches its checksum

Args

local_file : str
string pointing to the local file
expected_md5 : str
checksum assigned to local_file.
Source code
def report_cached_file(local_file, expected_md5):
    """
    Report whether the local file matches its checksum

    Args:
        local_file : str
            string pointing to the local file
        expected_md5 : str
            checksum assigned to `local_file`.
    """
    import os
    import os.path

    if not os.path.isfile(local_file):
        print("Local file {} does not exist".format(local_file))
    else:
        if len(expected_md5) == 0 or md5sum(local_file) == expected_md5:
            print("Cached file {} is valid".format(local_file))
        else:
            print(
                "Cached file {} failed, checksum {}".format(
                    local_file, md5sum(local_file)
                )
            )