How I learned to stop worring and love the join
Created by Dan Vatterott / @dvatterott
Read more here
SELECT
*
FROM
my_table
LIMIT 5
Customer_id | listen_date | artist | artist_id | song_id |
---|---|---|---|---|
1 | "2017-11-01" | "A Tribe Called Quest" | 5 | 1 |
2 | "2017-11-01" | "A Tribe Call Quest" | 5 | 7 |
2 | "2017-11-02" | "Wu-Tang Clan'" | 1 | 7 |
Create a list of people who listened to "A Tribe Called Quest".
SELECT
DISTINCT user_id
FROM
listen_table
WHERE
artist = "A Tribe Called Quest"
Create a list of people who listened to bands whose names start with "The".
SELECT
DISTINCT user_id
FROM
listen_table
WHERE
artist LIKE "The%"
Note: sql alchemy (python api) gets angry about % signs (use %%)
Create a list of the 10 people that listened to the most songs by "A Tribe Called Quest".
SELECT
user_id,
COUNT(song_id) AS song_count
FROM
listen_table
WHERE
artist = "A Tribe Called Quest"
GROUP BY
user_id
ORDER BY
song_count DESC
LIMIT 10
Create a list of the people that listened to more than 10 songs by "A Tribe Called Quest".
SELECT
user_id,
COUNT(song_id) AS song_count
FROM
listen_table
WHERE
artist = "A Tribe Called Quest"
GROUP BY
user_id
HAVING
COUNT(song_id) > 10
Note: Can't use select alias in where/having :/
This is an outer join.
Concentrate on what happens to missing/repeated values.
artist_id | artist | member_count | music_genre |
---|---|---|---|
5 | "A Tribe Called Quest" | 3 | "Hip Hop" |
1 | "Wu-Tang Clan" | 9 | "Hip Hop" |
2 | "Fugees" | 3 | "R and B" |
Create a list of the 10 people who listened to the most "Hip Hop".
SELECT
lt.user_id,
COUNT(DISTINCT lt.song_id ) AS listen_count
FROM
listen_table AS lt
INNER JOIN artist_table AS at
ON lt.artist_id = at.artist_id
WHERE
at.music_genre = "Hip Hop"
GROUP BY
lt.user_id
ORDER BY
listen_count
LIMIT 10
Create a list of the largest group (# members) in each genre.
SELECT
genre,
artist,
member_count
FROM
(SELECT
genre,
MAX(member_count) AS max_count
FROM
artist_table
GROUP BY
genre
) AS lt
INNER JOIN artist_table AS at
ON lt.genre = at.genre AND lt.max_count = at.member_count
Think about what happens with ties?
Create a list of the largest group (# members) in each genre.
WITH temp_table AS (
SELECT
genre,
MAX(member_count) AS max_count
FROM
artist_table
GROUP BY
genre
)
SELECT
genre,
artist,
member_count
FROM
temp_table AS tt
INNER JOIN artist_table AS at
ON tt.genre = at.genre AND tt.max_count = at.member_count
Create a list of the largest group (# members) in each genre.
SELECT
genre,
artist,
member_count
FROM
artist_table
WHERE
rank() OVER (PARTITION BY genre ORDER BY member_count DESC) = 1
Create a list of people who listened to "A Tribe Called Quest" and "Fugees".
WITH temp_table AS (
SELECT
DISTINCT user_id AS user_id
FROM
listen_table
WHERE
artist = 'A Tribe Called Quest'
)
SELECT
DISTINCT user_id AS user_id
FROM
listen_table
WHERE
artist = 'Fugees'
AND
user_id IN (SELECT user_id FROM temp_table)
Create a list of people who listened to "A Tribe Called Quest" and "Fugees".
WITH temp_table AS (
SELECT
DISTINCT user_id AS user_id
FROM
listen_table
WHERE
artist = 'A Tribe Called Quest'
)
SELECT
DISTINCT user_id AS user_id
FROM
listen_table AS lt
WHERE
artist = 'Fugees'
AND
EXISTS (
SELECT
1
FROM
temp_table
WHERE
temp_table.user_id = lt.user_id)
Create a list of people who listened to "A Tribe Called Quest" and "Fugees".
WITH tribe_table AS (
SELECT
DISTINCT user_id AS user_id
FROM
listen_table
WHERE
artist = 'A Tribe Called Quest'
)
WITH fugee_table AS (
SELECT
DISTINCT user_id AS user_id
FROM
listen_table
WHERE
artist = 'Fugees'
)
SELECT
user_id
FROM
tribe_table AS tt
JOIN fugee_table AS ft
ON tt.user_id = ft.user_id
Look, a self join!
Count whether more songs are listened to on the weekends or weekdays.
SELECT
CASE
WHEN extract('dow' from listen_date) > 5 THEN 'weekend'
ELSE 'weekday'
END AS weekend_or_not,
COUNT(*)
FROM
listen_table AS lt
GROUP BY
weekend_or_not
Create a list of users and the first artist they listened to.
WITH temp_table AS (
SELECT
user_id,
artist,
rank() OVER (PARTITION BY user_id ORDER BY listen_date ASC)
FROM
listen_table
)
SELECT
user_id,
artist
FROM
temp_table
WHERE
rank = 1
Temp tables are faster when many tables. Joins are faster when using fewer tables.
The way that SQL handles null values is often not how you would expect.
SELECT
genre,
artist,
list_count
FROM
(SELECT
genre,
artist,
COUNT(*) AS listen_count
FROM
listen_table
GROUP BY
genre, artist_id
) AS lt
WHERE
rank() OVER (PARTITION BY genre ORDER BY listen_count DESC) = 1
Assumes each artist only has one genre listing.