################################################################################
# This file is a part of the following submission:
#   "Linear state-space model with time-varying dynamics"
#   Luttinen, Raiko, Ilin (ECML 2014)
#
# Copyright (C) 2013-2014 Jaakko Luttinen
#
# This file is licensed under Version 3.0 of the GNU General Public
# License. See LICENSE for a text of the license.
################################################################################

"""
This module contains functions for parsing, loading and saving the GSOD (Global
Summary of the Day) data of NCDC.

You can download the data for 2000-2009 as follows:

.. code-block:: console

    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/ish-history.csv
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2000/gsod_2000.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2001/gsod_2001.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2002/gsod_2002.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2003/gsod_2003.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2004/gsod_2004.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2005/gsod_2005.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2006/gsod_2006.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2007/gsod_2007.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2008/gsod_2008.tar
    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/2009/gsod_2009.tar

or using a for-loop:

.. code-block:: console

    wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/ish-history.csv
    for year in {2000..2009..1}
    do
        wget ftp://ftp.ncdc.noaa.gov/pub/data/gsod/$year/gsod_$year.tar
    done

Data is available since 1901, in case you want to download other time periods.

For parsing the raw GSOD data from 2000-2009 into a usable HDF format, use:

.. code-block:: console

    python gsod.py --path=/path/to/gsod/data/ --first=2000 --last=2009

Default path is the current directory, default time period is 2000-2009.  Note
that this may take several hours.
"""

import numpy as np
import scipy
import matplotlib.pyplot as plt

import datetime

from matplotlib.ticker import NullLocator
from matplotlib.dates import AutoDateLocator, AutoDateFormatter

from bayespy.utils import misc
from bayespy.utils import random

import sys
import getopt
import os
import tempfile

import h5py

from scipy import constants


def get_filename(begin_year, end_year):
    """
    Build the HDF5 file name used for the period begin_year-end_year.

    Only the bare file name is returned (no directory component), for example
    ``gsod_data_2000-2009.hdf5``.
    """
    period = (begin_year, end_year)
    template = 'gsod_data_%d-%d.hdf5'
    return template % period


def parse_data(begin_year=2000, end_year=2009, path='.'):
    """
    Parse the raw Global Summary of the Day (GSOD) dataset into matrix form.

    For each year in ``begin_year``..``end_year`` (inclusive) the archive
    ``gsod_<year>.tar`` is read from ``path``. From every station file the
    station identifiers, date, daily mean temperature, sea level pressure and
    station pressure are extracted. Temperatures are converted from Fahrenheit
    to Celsius. Station coordinates are read from ``ish-history.csv`` in
    ``path``.

    The result is written to the HDF5 file named by ``get_filename`` (relative
    to the current working directory) with the datasets ``temperature``,
    ``pressure``, ``seapressure`` (each of shape stations x days, NaN where no
    observation exists), ``coordinates`` (one row of three values per station,
    NaN if the station is not found in the history file) and ``time``.

    This implementation is extremely simple and not optimized. It is meant to
    parse only short time periods. Parsing even a few years may take hours.

    Parameters
    ----------
    begin_year : int
        First year to parse.
    end_year : int
        Last year to parse (inclusive).
    path : str
        Directory containing the yearly TAR files and ``ish-history.csv``.
    """

    import tarfile
    from matplotlib.dates import datestr2num

    def as_text(x):
        # Depending on the NumPy version and its ``encoding`` default,
        # np.loadtxt hands converters either bytes or str. Normalize to str so
        # that the comparisons below work in both cases.
        if isinstance(x, bytes):
            return x.decode('latin1')
        return x

    def missing_converter(x):
        # GSOD marks missing temperature and pressure values with 9999.9
        x = as_text(x)
        if x == '9999.9':
            return np.nan
        return float(x)

    def date_converter(x):
        # YEARMODA (e.g. 20000131) -> matplotlib date number
        return datestr2num(str(int(as_text(x))))

    # Collect the per-station arrays and stack them once at the end. Stacking
    # inside the loop would copy all previously read data for every station.
    chunks = []
    for year in range(begin_year, end_year+1):
        # Open the yearly TAR file containing the zipped station data files
        filename = os.path.join(path, 'gsod_%d.tar' % year)
        with tarfile.open(filename) as tar:
            tar_infos = tar.getmembers()
            print('Processing year %d (%d stations)..'
                  % (year, len(tar_infos)-1))
            for tar_info in tar_infos:
                # extractfile returns None for members that are not regular
                # files (e.g., directories)
                zipped = tar.extractfile(tar_info)
                if zipped is None:
                    continue

                # Copy the gzipped member into a temporary '.gz' file so that
                # np.loadtxt can decompress it based on the file extension.
                # The file is closed before reading (delete=False) and always
                # removed afterwards so that temporary files do not pile up.
                tmpf = tempfile.NamedTemporaryFile(suffix='.gz', delete=False)
                try:
                    try:
                        tmpf.write(zipped.read())
                    finally:
                        tmpf.close()
                    y = np.loadtxt(tmpf.name,
                                   skiprows=1,
                                   usecols=(0, 1, 2, 3, 7, 9),
                                   converters={2: date_converter,
                                               3: missing_converter,
                                               7: missing_converter,
                                               9: missing_converter})
                finally:
                    os.remove(tmpf.name)

                # A file with a single record yields a 1-D array; a file
                # without records yields an empty array which is skipped.
                if np.size(y) > 0:
                    chunks.append(np.atleast_2d(y))

    if chunks:
        Y = np.vstack(chunks)
    else:
        Y = np.empty((0, 6))

    # Combine the USAF and WBAN identifiers into a single station id
    station_id = np.array([int("%06d%05d" % (usaf, wban))
                           for (usaf, wban) in zip(Y[:,0].astype(int),
                                                   Y[:,1].astype(int))])
    (stations, station_ind) = np.unique(station_id,
                                        return_inverse=True)
    (times, time_ind) = np.unique(Y[:,2].astype(int),
                                  return_inverse=True)

    def to_matrix(values):
        # Scatter the observations into a (stations x days) matrix, leaving
        # NaN wherever a station has no record for a day
        X = np.empty((len(stations), len(times)))
        X[:] = np.nan
        X[station_ind, time_ind] = values
        return X

    # Fahrenheit to Celsius. Computed explicitly because
    # scipy.constants.F2C is not available in recent SciPy versions.
    temperature = to_matrix((Y[:,3] - 32.0) / 1.8)
    seapressure = to_matrix(Y[:,4])
    pressure = to_matrix(Y[:,5])

    # Converter that ignores the quotes and converts string to numeric
    def int_converter(x):
        x = as_text(x)
        if x in ('""', '"-99999"', '"-999999"'):
            return np.nan
        return int(x[1:-1])

    # Parse station identifiers and coordinates (LAT, LON, ELEV)
    history = np.loadtxt(os.path.join(path, 'ish-history.csv'),
                         skiprows=1,
                         usecols=(0,1,7,8,9),
                         delimiter=',',
                         converters={0: int_converter,
                                     1: int_converter,
                                     7: int_converter,
                                     8: int_converter,
                                     9: int_converter},
                         comments=None)
    history = np.atleast_2d(history)

    # Map each station id of the history file to its first row so that the
    # lookup below is a dictionary access instead of a scan over all rows
    first_row = {}
    history_ids = (int("%06d%05d" % (usaf, wban))
                   for (usaf, wban) in zip(history[:,0].astype(int),
                                           history[:,1].astype(int)))
    for (row, sid) in enumerate(history_ids):
        first_row.setdefault(sid, row)

    # Find the coordinates of those stations that are in the dataset. Stations
    # missing from the history file get NaN coordinates.
    coordinates = [history[first_row[int(station)], 2:]
                   if int(station) in first_row
                   else misc.nans(3)
                   for station in stations]
    coordinates = np.array(coordinates, dtype=float).reshape((-1, 3))
    coordinates[:,:2] /= 1000 # convert to degrees from millidegrees

    # The context manager closes the file even if writing fails
    with h5py.File(get_filename(begin_year, end_year), 'w') as f:
        misc.write_to_hdf5(f, temperature, 'temperature')
        misc.write_to_hdf5(f, pressure, 'pressure')
        misc.write_to_hdf5(f, seapressure, 'seapressure')
        misc.write_to_hdf5(f, coordinates, 'coordinates')
        misc.write_to_hdf5(f, times, 'time')


def load_data(begin_year=2000, end_year=2009, remove_unlocated=False,
              nan_threshold=1.0, variable='temperature'):
    """
    Load GSOD data from HDF5 file.

    The time period must have been parsed earlier with ``parse_data``; the
    file named by ``get_filename(begin_year, end_year)`` is opened read-only.

    Parameters
    ----------
    begin_year, end_year : int
        Time period identifying the HDF5 file.
    remove_unlocated : bool
        If True, drop the stations that have a NaN in any coordinate.
    nan_threshold : float
        If less than 1.0, keep only the stations whose fraction of NaN
        observations is at most this value.
    variable : str
        Name of the dataset to load, e.g., ``'temperature'``, ``'pressure'``
        or ``'seapressure'``.

    Returns
    -------
    (y, coordinates, time) : tuple of arrays
        The requested variable (one row per station), the station coordinates
        and the time stamps, as stored in the file. Station filtering is
        applied to ``y`` and ``coordinates``; ``time`` is returned unchanged.

    Raises
    ------
    KeyError
        If ``variable`` is not a dataset of the file.
    """

    # Load variables. The context manager guarantees that the file is closed
    # even if one of the datasets is missing.
    filename = get_filename(begin_year, end_year)
    with h5py.File(filename, 'r') as data:
        y = data[variable][...]
        coordinates = data['coordinates'][...]
        time = data['time'][...]

    # Remove the stations without complete coordinates
    if remove_unlocated:
        located = ~np.any(np.isnan(coordinates), axis=-1)
        y = y[located]
        coordinates = coordinates[located]

    # Remove the stations with too large a proportion of missing values
    if nan_threshold < 1.0:
        nans = np.sum(np.isnan(y), axis=-1)
        N = np.shape(y)[-1]
        keep = (nans/N) <= nan_threshold
        y = y[keep]
        coordinates = coordinates[keep]

    return (y, coordinates, time)


def parse_arguments(argv):
    """
    Convert command line options into keyword arguments for ``parse_data``.

    Recognized options are ``--path=<PATH>``, ``--first=<INT>`` and
    ``--last=<INT>``. Positional arguments are ignored.

    Parameters
    ----------
    argv : list of str
        Command line arguments without the program name.

    Returns
    -------
    dict
        Contains the keys ``path``, ``begin_year`` and ``end_year`` for those
        options that were given.

    Raises
    ------
    getopt.GetoptError
        If an unknown option is given, an option lacks its value, or a year is
        not an integer.
    """
    opts, _ = getopt.getopt(argv,
                            "",
                            [
                                "path=",
                                "first=",
                                "last=",
                            ])

    # Options holding a year and the parse_data keyword they map to
    year_options = {"--first": "begin_year", "--last": "end_year"}

    kwargs = {}
    for opt, arg in opts:
        if opt == "--path":
            kwargs["path"] = arg
        elif opt in year_options:
            try:
                kwargs[year_options[opt]] = int(arg)
            except ValueError:
                # Report a malformed year like any other usage error
                raise getopt.GetoptError(
                    "option %s requires an integer argument" % opt, opt)
        else:
            raise ValueError("Unhandled argument given")

    return kwargs


def main(argv=None):
    """
    Command line entry point: parse the options and run ``parse_data``.

    If ``argv`` is None, the options are taken from ``sys.argv[1:]``. On
    invalid options the error and the usage text are printed and the process
    exits with status 2.
    """
    if argv is None:
        argv = sys.argv[1:]

    try:
        kwargs = parse_arguments(argv)
    except getopt.GetoptError as err:
        print(err)
        print('python gsod.py <options>')
        print('--path=<PATH>  Path to the GSOD directory downloaded from NCDC')
        print('--first=<INT>  The first year to parse')
        print('--last=<INT>   The last year to parse')
        sys.exit(2)

    parse_data(**kwargs)


if __name__ == '__main__':
    main()

