diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index ca15bf2d..57b79f10 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -32,6 +32,33 @@ "details":"Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing.", "size":1 }, + "football_data":{ + "files":[ + [ + "E0.csv", "E1.csv", "E2.csv", "E3.csv" + ] + ], + "citation":"", + "license":null, + "urls":[ + "http://www.football-data.co.uk/mmz4281/" + ], + "details":"Results of English football matches since 1993/94 season.", + "size":1 + }, + "google_trends":{ + "files":[ + [ + ] + ], + "citation":"", + "license":null, + "urls":[ + "http://www.google.com/trends/" + ], + "details":"Google trends results.", + "size":0 + }, "osu_accad":{ "files":[ [ diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 3c44703a..54e42733 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -1,5 +1,8 @@ +import csv import os +import copy import numpy as np +import pylab as pb import GPy import scipy.io import cPickle as pickle @@ -7,6 +10,8 @@ import zipfile import tarfile import datetime import json +import re + ipython_available=True try: import IPython @@ -32,11 +37,18 @@ neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' # Read data resources from json file. # Don't do this when ReadTheDocs is scanning as it breaks things on_rtd = os.environ.get('READTHEDOCS', None) == 'True' #Checks if RTD is scanning + if not (on_rtd): path = os.path.join(os.path.dirname(__file__), 'data_resources.json') json_data=open(path).read() data_resources = json.loads(json_data) +if not (on_rtd): + path = os.path.join(os.path.dirname(__file__), 'football_teams.json') + json_data=open(path).read() + football_dict = json.loads(json_data) + + def prompt_user(prompt): """Ask user for agreeing to data set licenses.""" @@ -274,9 +286,55 @@ def della_gatta_TRP63_gene_expression(data_set='della_gatta', gene_number=None): Y = Y[:, None] return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set) + + +def football_data(season='1314', data_set='football_data'): + """Football data from English games since 1993. This downloads data from football-data.co.uk for the given season. """ + def league2num(string): + league_dict = {'E0':0, 'E1':1, 'E2': 2, 'E3': 3, 'EC':4} + return league_dict[string] + + def football2num(string): + if football_dict.has_key(string): + return football_dict[string] + else: + football_dict[string] = len(football_dict)+1 + return len(football_dict)+1 + + data_set_season = data_set + '_' + season + data_resources[data_set_season] = copy.deepcopy(data_resources[data_set]) + data_resources[data_set_season]['urls'][0]+=season + '/' + start_year = int(year[0:2]) + end_year = int(year[2:4]) + files = ['E0.csv', 'E1.csv', 'E2.csv', 'E3.csv'] + if start_year>4 and start_year < 93: + files += ['EC.csv'] + data_resources[data_set_season]['files'] = [files] + if not data_available(data_set_season): + download_data(data_set_season) + for file in reversed(files): + filename = os.path.join(data_path, data_set_season, file) + # rewrite files removing blank rows. + writename = os.path.join(data_path, data_set_season, 'temp.csv') + input = open(filename, 'rb') + output = open(writename, 'wb') + writer = csv.writer(output) + for row in csv.reader(input): + if any(field.strip() for field in row): + writer.writerow(row) + input.close() + output.close() + table = np.loadtxt(writename,skiprows=1, usecols=(0, 1, 2, 3, 4, 5), converters = {0: league2num, 1: pb.datestr2num, 2:football2num, 3:football2num}, delimiter=',') + X = table[:, :4] + Y = table[:, 4:] + return data_details_return({'X': X, 'Y': Y}, data_set) + +# This will be for downloading google trends data. def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends'): + """Data downloaded from Google trends for given query terms.""" # Inspired by this notebook: # http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb + # quote the query terms. for i, element in enumerate(query_terms): query_terms[i] = urllib2.quote(element) @@ -284,18 +342,20 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'], data = urllib2.urlopen(query).read() - # We need to do some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD. + # In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD. header = """// Data table response\ngoogle.visualization.Query.setResponse(""" data = data[len(header):-2] data = re.sub('new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data) timeseries = json.loads(data) - import pandas as pd + #import pandas as pd columns = [k['label'] for k in timeseries['table']['cols']] rows = map(lambda x: [k['v'] for k in x['c']], timeseries['table']['rows']) - df = pd.DataFrame(rows, columns=columns) - df.set_index('Date', inplace=True) - df.plot(figsize=(16, 8)) - + terms = len(columns)-1 + X = np.asarray([(pb.datestr2num(row[0]), i) for i in range(terms) for row in rows ]) + Y = np.asarray([[row[i+1]] for i in range(terms) for row in rows ]) + output_info = columns[1:] + return data_details_return({'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set) + # The data sets def oil(data_set='three_phase_oil_flow'): """The three phase oil data from Bishop and James (1993).""" diff --git a/GPy/util/football_teams.json b/GPy/util/football_teams.json new file mode 100644 index 00000000..a4eb9c38 --- /dev/null +++ b/GPy/util/football_teams.json @@ -0,0 +1 @@ +{"Canvey Island": 94, "Crewe": 21, "Fleetwood Town": 134, "Wrexham": 89, "Barnet": 69, "Ipswich": 29, "Rochdale": 84, "Bristol Rvs": 70, "Liverpool": 10, "Chelsea": 20, "York": 113, "Newcastle": 18, "QPR": 28, "Middlesboro": 116, "Tranmere": 68, "Bury": 72, "Luton": 24, "AFC Wimbledon": 126, "West Ham": 15, "Braintree Town": 135, "Bournemouth": 58, "Hayes & Yeading": 130, "Rushden & D": 81, "Weymouth": 120, "Chesterfield": 48, "Exeter": 104, "Barnsley": 45, "Aldershot": 95, "Gateshead": 129, "Hartlepool": 55, "Newport County": 132, "Crystal Palace": 23, "Ebbsfleet": 123, "Wigan": 19, "Shrewsbury": 83, "Hereford": 105, "Stevenage": 111, "Grimsby": 73, "Crawley Town": 114, "Morecambe": 109, "Oldham": 61, "Aston Villa": 1, "Bristol City": 51, "Gravesend": 103, "Huddersfield": 60, "Reading": 33, "Nuneaton Town": 140, "AFC Telford United": 137, "Wycombe": 91, "Leeds": 43, "Colchester": 54, "Rotherham": 63, "Southport": 100, "Southampton": 37, "Darlington": 82, "Blackburn": 16, "Bath City": 133, "Yeovil": 62, "Leyton Orient": 75, "Forest Green": 101, "Chester": 80, "Halifax": 110, "Portsmouth": 11, "Woking": 108, "Histon": 125, "Man City": 7, "Northampton": 78, "Arsenal": 17, "Charlton": 14, "Middlesbrough": 9, "Watford": 41, "Nott'm Forest": 59, "Eastbourne Borough": 131, "Hull": 27, "Barrow": 127, "Doncaster": 52, "Carlisle": 92, "Gillingham": 53, "Accrington": 93, "Dartford": 139, "Altrincham": 112, "Scarborough": 106, "Northwich": 117, "Farsley": 124, "Tamworth": 96, "St. Albans": 119, "Alfreton Town": 136, "Mansfield": 86, "Macclesfield": 76, "Torquay": 87, "Brighton": 26, "Bradford": 56, "Lincoln": 77, "Brentford": 49, "Everton": 3, "Cambridge": 102, "Sheffield United": 35, "Stockport": 85, "Bolton": 2, "Southend": 65, "Cheltenham": 71, "Walsall": 64, "Preston": 42, "Peterboro": 79, "Birmingham": 6, "Boston": 90, "Burton": 97, "West Brom": 8, "Man United": 4, "Stafford Rangers": 118, "Wimbledon": 115, "Scunthorpe": 50, "Kidderminster": 107, "Millwall": 44, "Swansea": 67, "Norwich": 31, "Burnley": 22, "Sunderland": 13, "Sheffield Weds": 40, "Fulham": 5, "Dag and Red": 99, "Oxford": 74, "Stoke": 39, "Tottenham": 12, "Kettering Town": 128, "Coventry": 32, "Wolves": 38, "Port Vale": 66, "Milton Keynes Dons": 57, "Plymouth": 34, "Derby": 25, "Notts County": 88, "Leicester": 36, "Droylsden": 121, "Blackpool": 47, "Salisbury": 122, "Cardiff": 30, "Grays": 98, "Swindon": 46, "Hyde United": 138} \ No newline at end of file