Browse code

Add scripts to download NHL game data

Joseph Weston authored on 07/02/2021 19:46:30
Showing 5 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,15 @@
1
+# NHL stats
2
+
3
+This repo will contain scripts for dumping full NHL game feeds.
4
+
5
+## API URLs
6
+
7
+This page contains docs on the API: https://gitlab.com/dword4/nhlapi/-/blob/master/stats-api.md
8
+
9
+The main API endpoint is https://statsapi.web.nhl.com/api/v1
10
+
11
+The important endpoints are /game/ID/feed/live which contains the full game info, including all plays,
12
+and /game/ID/linescore which contains aggregate statistics for each player during the match.
13
+
14
+Game discovery is via /schedule; query strings are used to get schedule data between certain dates
15
+or for a whole season with: ?season=20172018
0 16
new file mode 100755
... ...
@@ -0,0 +1,77 @@
1
+#!/usr/bin/env python
2
+"""Download full game feeds."""
3
+
4
+import argparse
5
+import asyncio
6
+from functools import partial
7
+from pathlib import Path
8
+import sys
9
+
10
+import backoff
11
+import httpx
12
+import msgpack
13
+import tqdm
14
+
15
# Bound ``str.format`` of the API base URL: ``api_url(path)`` returns the base
# URL with ``path`` substituted for the trailing ``{}`` placeholder.
api_url = "https://statsapi.web.nhl.com/api/v1{}".format
16
+
17
+
18
def get_game_guids(season: int) -> list[int]:
    """Return the ``gamePk`` of every finished game in a stored season schedule.

    The schedule is read from ``seasons/<season><season+1>.msgpack``. Only
    games whose ``status.abstractGameState`` equals ``"Final"`` are returned,
    in the order they appear in the schedule.

    Raises:
        ValueError: If no schedule file exists for the requested season.
    """
    schedule_file = Path(f"seasons/{season}{season+1}.msgpack")
    if not schedule_file.exists():
        raise ValueError(f"No game data for {season} season")

    with schedule_file.open("rb") as stream:
        schedule = msgpack.load(stream)

    # TODO: replace with JSONPath query
    finished = []
    for day in schedule["dates"]:
        for game in day["games"]:
            if game["status"]["abstractGameState"] == "Final":
                finished.append(game["gamePk"])
    return finished
33
+
34
+
35
@backoff.on_exception(
    backoff.expo,
    (httpx.ConnectTimeout, httpx.ConnectError),
    max_time=120,
)
async def get_game_feed(client: httpx.AsyncClient, guid: int) -> dict:
    """Fetch the full live feed of one game and return the decoded JSON.

    Connection errors and connection timeouts are retried with exponential
    backoff for up to 120 seconds before the exception propagates.

    Args:
        client: Asynchronous HTTP client used to issue the request.
        guid: The game's ``gamePk`` identifier.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    resp = await client.get(api_url(f"/game/{guid}/feed/live"))
    # Fail here on an error status instead of handing an error payload to the
    # caller, where it would surface later as an unrelated lookup failure.
    resp.raise_for_status()
    return resp.json()
43
+
44
+
45
def feed_downloaded(guid: int) -> bool:
    """Tell whether ``games/<guid>.msgpack`` already exists on disk."""
    feed_file = Path("games") / f"{guid}.msgpack"
    return feed_file.exists()
47
+
48
+
49
def save_game_feed(feed):
    """Write a game feed to ``games/<gamePk>.msgpack``.

    The ``games`` directory is created when missing. The data is first written
    to a ``.part`` file and then renamed, so an interrupted run never leaves a
    truncated ``.msgpack`` file that would later be mistaken for a finished
    download.

    Args:
        feed: Decoded game feed; must contain the key ``"gamePk"``.
    """
    guid = feed["gamePk"]
    target = Path(f"games/{guid}.msgpack")
    target.parent.mkdir(parents=True, exist_ok=True)

    tmp = target.with_name(target.name + ".part")
    with tmp.open("wb") as f:
        msgpack.dump(feed, f)
    # Replace only after the dump succeeded so the final name is always complete.
    tmp.replace(target)
53
+
54
+
55
async def main(season: int, force: bool = False):
    """Download and store the live feed of every finished game of a season.

    Args:
        season: First year of the season (2008 selects the 2008/2009 schedule).
        force: When true, feeds that are already on disk are downloaded again
            instead of being skipped.
    """
    guids = [
        guid
        for guid in get_game_guids(season=season)
        if force or not feed_downloaded(guid)
    ]
    if not guids and not force:
        print(f"Games for {season} season already downloaded", file=sys.stderr)
        return

    max_connections = 10
    # Only as many requests as there are pooled connections may be in flight;
    # otherwise the surplus requests wait inside httpx for a free connection
    # and can fail with a pool timeout that is not retried.
    slots = asyncio.Semaphore(max_connections)

    async def fetch(client, guid):
        async with slots:
            return await get_game_feed(client, guid)

    limits = httpx.Limits(
        max_keepalive_connections=5, max_connections=max_connections
    )
    async with httpx.AsyncClient(limits=limits) as client:
        pending = [fetch(client, guid) for guid in guids]
        progress = tqdm.tqdm(
            asyncio.as_completed(pending),
            desc=f"{season} season games",
            total=len(guids),
        )
        # Feeds are saved in completion order, not schedule order.
        for feed_coro in progress:
            save_game_feed(await feed_coro)
71
+
72
+
73
if __name__ == "__main__":
    # Command-line entry point: a single positional integer selects the season.
    cli = argparse.ArgumentParser(description="Download NHL game feeds")
    cli.add_argument("season", type=int, help="Which season to download")
    options = cli.parse_args()
    asyncio.run(main(options.season))
0 78
new file mode 100755
... ...
@@ -0,0 +1,19 @@
1
#!/bin/bash
# Download the schedule of one NHL season and store it as msgpack under
# seasons/<season>.msgpack.
#
# Usage: pass the first year of the season as the only argument,
#        e.g. 2008 for the 2008/2009 season.

# Make a failing curl fail the whole pipeline, not just its first stage.
set -o pipefail

season_start="$1"

if [[ -z $season_start ]]; then
    echo "Expected Season (e.g. 2008), but none provided" >&2
    exit 1
fi

# Reject anything that is not a plain number before doing arithmetic on it.
if [[ ! $season_start =~ ^[0-9]+$ ]]; then
    echo "Expected Season (e.g. 2008), but got '$season_start'" >&2
    exit 1
fi

# 2008 -> 20082009 as required by NHL API
season="${season_start}$((season_start + 1))"
url="https://statsapi.web.nhl.com/api/v1/schedule?season=$season"
outfile="seasons/$season.msgpack"
tmpfile="$outfile.part"

echo
echo "Hitting $url and saving to $outfile"
echo

mkdir -p seasons

# Write to a temporary file first so a failed download or conversion never
# leaves an empty or truncated schedule behind under the final name.
if curl --fail "$url" | ./json2msgpack > "$tmpfile"; then
    mv "$tmpfile" "$outfile"
else
    rm -f "$tmpfile"
    echo "Failed to download schedule for season $season" >&2
    exit 1
fi
0 20
new file mode 100644
... ...
@@ -0,0 +1,13 @@
1
+name: nhl-stats
2
+channels:
3
+    - defaults
4
+    - conda-forge
5
+dependencies:
6
+    - python
7
+    - ipython
8
+    - ipykernel
9
+    # Actual deps
10
+    - backoff
11
+    - httpx
12
+    - msgpack-python
13
+    - tqdm
0 14
new file mode 100755
... ...
@@ -0,0 +1,7 @@
1
+#!/usr/bin/env -S python -u
2
+
3
+import json
4
+import sys
5
+import msgpack
6
+
7
# Convert a JSON document on stdin to msgpack on stdout. The input is read as
# raw bytes so decoding does not depend on the locale's text encoding;
# json.load detects the UTF-8/16/32 encoding itself when given bytes.
msgpack.dump(json.load(sys.stdin.buffer), sys.stdout.buffer)