Google trends and football data sets.

2026-07-26 17:11:06 +02:00 · 2014-03-13 15:59:11 +00:00 · 2014-03-13 15:59:11 +00:00 · 73e877a458
commit 73e877a458
parent 86f92869a1
3 changed files with 94 additions and 6 deletions
--- a/GPy/util/data_resources.json
+++ b/GPy/util/data_resources.json
@ -32,6 +32,33 @@
      "details":"Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing.",
      "size":1
   },
+   "football_data":{
+      "files":[
+         [
+	     "E0.csv", "E1.csv", "E2.csv", "E3.csv"
+         ]
+      ],
+      "citation":"",
+      "license":null,
+      "urls":[
+          "http://www.football-data.co.uk/mmz4281/"
+      ],
+      "details":"Results of English football matches since 1993/94 season.",
+      "size":1
+   },
+   "google_trends":{
+      "files":[
+         [
+         ]
+      ],
+      "citation":"",
+      "license":null,
+      "urls":[
+          "http://www.google.com/trends/"
+      ],
+      "details":"Google trends results.",
+      "size":0
+   },
   "osu_accad":{
      "files":[
         [
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -1,5 +1,8 @@
+import csv
 import os
+import copy
 import numpy as np
+import pylab as pb
 import GPy
 import scipy.io
 import cPickle as pickle
@ -7,6 +10,8 @@ import zipfile
 import tarfile
 import datetime
 import json
+import re
+
 ipython_available=True
 try:
    import IPython
@ -32,11 +37,18 @@ neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
 # Read data resources from json file.
 # Don't do this when ReadTheDocs is scanning as it breaks things
 on_rtd = os.environ.get('READTHEDOCS', None) == 'True' #Checks if RTD is scanning
+
 if not (on_rtd):
    path = os.path.join(os.path.dirname(__file__), 'data_resources.json')
    json_data=open(path).read()
    data_resources = json.loads(json_data)

+if not (on_rtd):
+    path = os.path.join(os.path.dirname(__file__), 'football_teams.json')
+    json_data=open(path).read()
+    football_dict = json.loads(json_data)
+
+    

 def prompt_user(prompt):
    """Ask user for agreeing to data set licenses."""
@ -274,9 +286,55 @@ def della_gatta_TRP63_gene_expression(data_set='della_gatta', gene_number=None):
            Y = Y[:, None]
    return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set)

+    
+
+def football_data(season='1314', data_set='football_data'):
+    """Football data from English games since 1993. This downloads data from football-data.co.uk for the given season. """
+    def league2num(string):
+        league_dict = {'E0':0, 'E1':1, 'E2': 2, 'E3': 3, 'EC':4}
+        return league_dict[string]
+
+    def football2num(string):
+        if football_dict.has_key(string):
+            return football_dict[string]
+        else:
+            football_dict[string] = len(football_dict)+1
+            return len(football_dict)+1
+
+    data_set_season = data_set + '_' + season
+    data_resources[data_set_season] = copy.deepcopy(data_resources[data_set])
+    data_resources[data_set_season]['urls'][0]+=season + '/'
+    start_year = int(year[0:2])
+    end_year = int(year[2:4])
+    files = ['E0.csv', 'E1.csv', 'E2.csv', 'E3.csv']
+    if start_year>4 and start_year < 93:
+        files += ['EC.csv']
+    data_resources[data_set_season]['files'] = [files]
+    if not data_available(data_set_season):
+        download_data(data_set_season)
+    for file in reversed(files):
+        filename = os.path.join(data_path, data_set_season, file)
+        # rewrite files removing blank rows.
+        writename = os.path.join(data_path, data_set_season, 'temp.csv')
+        input = open(filename, 'rb')
+        output = open(writename, 'wb')
+        writer = csv.writer(output)
+        for row in csv.reader(input):
+            if any(field.strip() for field in row):
+                writer.writerow(row)
+        input.close()
+        output.close()
+        table = np.loadtxt(writename,skiprows=1, usecols=(0, 1, 2, 3, 4, 5), converters = {0: league2num, 1: pb.datestr2num, 2:football2num, 3:football2num}, delimiter=',')
+        X = table[:, :4]
+        Y = table[:, 4:]
+    return data_details_return({'X': X, 'Y': Y}, data_set)
+
+# This will be for downloading google trends data.
 def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends'):
+    """Data downloaded from Google trends for given query terms."""
    # Inspired by this notebook:
    # http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb
+
    # quote the query terms.
    for i, element in enumerate(query_terms):
        query_terms[i] = urllib2.quote(element)
@ -284,18 +342,20 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],

    data = urllib2.urlopen(query).read()

-    # We need to do some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD.
+    # In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD.
    header = """// Data table response\ngoogle.visualization.Query.setResponse("""
    data = data[len(header):-2]
    data = re.sub('new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data)
    timeseries = json.loads(data)
-    import pandas as pd
+    #import pandas as pd
    columns = [k['label'] for k in timeseries['table']['cols']]
    rows = map(lambda x: [k['v'] for k in x['c']], timeseries['table']['rows'])
-    df = pd.DataFrame(rows, columns=columns)
-    df.set_index('Date', inplace=True)
-    df.plot(figsize=(16, 8))
-
+    terms = len(columns)-1
+    X = np.asarray([(pb.datestr2num(row[0]), i) for i in range(terms) for row in rows ])
+    Y = np.asarray([[row[i+1]] for i in range(terms) for row in rows ])
+    output_info = columns[1:]
+    return data_details_return({'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set)
+    
 # The data sets
 def oil(data_set='three_phase_oil_flow'):
    """The three phase oil data from Bishop and James (1993)."""
--- a/GPy/util/football_teams.json
+++ b/GPy/util/football_teams.json
@ -0,0 +1 @@
+{"Canvey Island": 94, "Crewe": 21, "Fleetwood Town": 134, "Wrexham": 89, "Barnet": 69, "Ipswich": 29, "Rochdale": 84, "Bristol Rvs": 70, "Liverpool": 10, "Chelsea": 20, "York": 113, "Newcastle": 18, "QPR": 28, "Middlesboro": 116, "Tranmere": 68, "Bury": 72, "Luton": 24, "AFC Wimbledon": 126, "West Ham": 15, "Braintree Town": 135, "Bournemouth": 58, "Hayes & Yeading": 130, "Rushden & D": 81, "Weymouth": 120, "Chesterfield": 48, "Exeter": 104, "Barnsley": 45, "Aldershot": 95, "Gateshead": 129, "Hartlepool": 55, "Newport County": 132, "Crystal Palace": 23, "Ebbsfleet": 123, "Wigan": 19, "Shrewsbury": 83, "Hereford": 105, "Stevenage": 111, "Grimsby": 73, "Crawley Town": 114, "Morecambe": 109, "Oldham": 61, "Aston Villa": 1, "Bristol City": 51, "Gravesend": 103, "Huddersfield": 60, "Reading": 33, "Nuneaton Town": 140, "AFC Telford United": 137, "Wycombe": 91, "Leeds": 43, "Colchester": 54, "Rotherham": 63, "Southport": 100, "Southampton": 37, "Darlington": 82, "Blackburn": 16, "Bath City": 133, "Yeovil": 62, "Leyton Orient": 75, "Forest Green": 101, "Chester": 80, "Halifax": 110, "Portsmouth": 11, "Woking": 108, "Histon": 125, "Man City": 7, "Northampton": 78, "Arsenal": 17, "Charlton": 14, "Middlesbrough": 9, "Watford": 41, "Nott'm Forest": 59, "Eastbourne Borough": 131, "Hull": 27, "Barrow": 127, "Doncaster": 52, "Carlisle": 92, "Gillingham": 53, "Accrington": 93, "Dartford": 139, "Altrincham": 112, "Scarborough": 106, "Northwich": 117, "Farsley": 124, "Tamworth": 96, "St. Albans": 119, "Alfreton Town": 136, "Mansfield": 86, "Macclesfield": 76, "Torquay": 87, "Brighton": 26, "Bradford": 56, "Lincoln": 77, "Brentford": 49, "Everton": 3, "Cambridge": 102, "Sheffield United": 35, "Stockport": 85, "Bolton": 2, "Southend": 65, "Cheltenham": 71, "Walsall": 64, "Preston": 42, "Peterboro": 79, "Birmingham": 6, "Boston": 90, "Burton": 97, "West Brom": 8, "Man United": 4, "Stafford Rangers": 118, "Wimbledon": 115, "Scunthorpe": 50, "Kidderminster": 107, "Millwall": 44, "Swansea": 67, "Norwich": 31, "Burnley": 22, "Sunderland": 13, "Sheffield Weds": 40, "Fulham": 5, "Dag and Red": 99, "Oxford": 74, "Stoke": 39, "Tottenham": 12, "Kettering Town": 128, "Coventry": 32, "Wolves": 38, "Port Vale": 66, "Milton Keynes Dons": 57, "Plymouth": 34, "Derby": 25, "Notts County": 88, "Leicester": 36, "Droylsden": 121, "Blackpool": 47, "Salisbury": 122, "Cardiff": 30, "Grays": 98, "Swindon": 46, "Hyde United": 138}
				`@ -0,0 +1 @@`
				{"Canvey Island": 94, "Crewe": 21, "Fleetwood Town": 134, "Wrexham": 89, "Barnet": 69, "Ipswich": 29, "Rochdale": 84, "Bristol Rvs": 70, "Liverpool": 10, "Chelsea": 20, "York": 113, "Newcastle": 18, "QPR": 28, "Middlesboro": 116, "Tranmere": 68, "Bury": 72, "Luton": 24, "AFC Wimbledon": 126, "West Ham": 15, "Braintree Town": 135, "Bournemouth": 58, "Hayes & Yeading": 130, "Rushden & D": 81, "Weymouth": 120, "Chesterfield": 48, "Exeter": 104, "Barnsley": 45, "Aldershot": 95, "Gateshead": 129, "Hartlepool": 55, "Newport County": 132, "Crystal Palace": 23, "Ebbsfleet": 123, "Wigan": 19, "Shrewsbury": 83, "Hereford": 105, "Stevenage": 111, "Grimsby": 73, "Crawley Town": 114, "Morecambe": 109, "Oldham": 61, "Aston Villa": 1, "Bristol City": 51, "Gravesend": 103, "Huddersfield": 60, "Reading": 33, "Nuneaton Town": 140, "AFC Telford United": 137, "Wycombe": 91, "Leeds": 43, "Colchester": 54, "Rotherham": 63, "Southport": 100, "Southampton": 37, "Darlington": 82, "Blackburn": 16, "Bath City": 133, "Yeovil": 62, "Leyton Orient": 75, "Forest Green": 101, "Chester": 80, "Halifax": 110, "Portsmouth": 11, "Woking": 108, "Histon": 125, "Man City": 7, "Northampton": 78, "Arsenal": 17, "Charlton": 14, "Middlesbrough": 9, "Watford": 41, "Nott'm Forest": 59, "Eastbourne Borough": 131, "Hull": 27, "Barrow": 127, "Doncaster": 52, "Carlisle": 92, "Gillingham": 53, "Accrington": 93, "Dartford": 139, "Altrincham": 112, "Scarborough": 106, "Northwich": 117, "Farsley": 124, "Tamworth": 96, "St. Albans": 119, "Alfreton Town": 136, "Mansfield": 86, "Macclesfield": 76, "Torquay": 87, "Brighton": 26, "Bradford": 56, "Lincoln": 77, "Brentford": 49, "Everton": 3, "Cambridge": 102, "Sheffield United": 35, "Stockport": 85, "Bolton": 2, "Southend": 65, "Cheltenham": 71, "Walsall": 64, "Preston": 42, "Peterboro": 79, "Birmingham": 6, "Boston": 90, "Burton": 97, "West Brom": 8, "Man United": 4, "Stafford Rangers": 118, "Wimbledon": 115, "Scunthorpe": 50, "Kidderminster": 107, "Millwall": 44, "Swansea": 67, "Norwich": 31, "Burnley": 22, "Sunderland": 13, "Sheffield Weds": 40, "Fulham": 5, "Dag and Red": 99, "Oxford": 74, "Stoke": 39, "Tottenham": 12, "Kettering Town": 128, "Coventry": 32, "Wolves": 38, "Port Vale": 66, "Milton Keynes Dons": 57, "Plymouth": 34, "Derby": 25, "Notts County": 88, "Leicester": 36, "Droylsden": 121, "Blackpool": 47, "Salisbury": 122, "Cardiff": 30, "Grays": 98, "Swindon": 46, "Hyde United": 138}