SQL Flashcards

(70 cards)

1
Q

Like

A
  • Used to search for a pattern in a field % match zero, one, or many characters
SELECT name
FROM people
WHERE name LIKE 'Ade%';
  • match a single character
SELECT name
FROM people
WHERE name LIKE E_';
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
2
Q

Not like

A
SELECT name
FROM people
WHERE name NOT LIKE 'A.%';
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
3
Q

Wildcard position

A
SELECT name
FROM people
WHERE name LIKE '%г*;

Returns = A.J. Langer

SELECT name
FROM people
WHERE name LIKE _
_t%';

returns = Anthony

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
4
Q

Where, In

A
WHERE, IN
SELECT title
FROM films
WHERE release_year IN (1920, 1930, 1940);
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
5
Q

Missing values

A

Missing values
* COUNT (field_name) includes only non-missing values
* COUNT (*) includes missing values

null
* Missing values:
* Human error
* Information not available

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
6
Q

Is Not Null

A

IS NOT NULL

SELECT COUNT (*) AS no_birthdates
FROM people
WHERE birthdate IS NOT NULL;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
7
Q

Summarizing data: Aggregate functions

A

AVG() , SUM() , MIN() , MAX() , COUNT ()

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
8
Q

Aggregate functions vs. arithmetic

A

Aggregate functions : aggregates columns
Arithmetic functions : aggregates rows

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
9
Q

Order of execution

A
  • Step 1: FROM
  • Step 2: WHERE
  • Step 3: SELECT (aliases are defined here)
  • Step 4: LIMIT
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
10
Q

Round

A

ROUND (number_to_round, decimal_places)

SELECT ROUND (AVG (budget), 2) AS avg_budget
FROM films
WHERE release_year >= 2010;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
11
Q

HAVING

A
SELECT release_year,
COUNT (title) As title_count
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;

Aggregate functions have to be in this order, group by and then having. You can’t use count

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
12
Q

USING

A
Inner join of presidents and prime_ministers, joining on country
SELECT p1.country, pl.continent, prime_minister, president
FROM prime_ministers AS p1
INNER JOIN presidents AS p2
USING (country);
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
12
Q

ON

A
Inner join of presidents and prime_ministers, joining on country
SELECT prime_ministers.country, prime_ministers.continent, prime_minister, president
FROM prime_ministers
INNER JOIN presidents
ON prime_minsters.country = presidents. country;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
13
Q

Chaining Joins

A
SELECT
p1. country, p1. continent, president, prime_minister, pm_start
FROM prime ministers as p1
INNER JOIN presidents as p2
USING(country)
INNER JOIN prime_minister_terms as p3
USING (prime_minister);

Chaining joins + and statements

SELECT name, e.year, fertility_rate, unemployment_rate
FROM countries AS c
INNER JOIN populations AS p
ON c. code = p. country_code
INNER JOIN economies AS e
ON c. code = e. code
-- Add an additional joining condition such that you are also joining on year
and p.year = e.year
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
14
Q

Cross Join

A

Returns all possible combinations

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
15
Q

Union Syntax

A

Returns unique values

SELECT *
FROM left table
UNION
SELECT *
FROM right_table;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
16
Q

Union All Syntax

A

Returns duplicate values

SELECT *
FROM left_table
UNION ALL
SELECT *
FROM right_table;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
17
Q

Subqueries in where clause - semi join

A
SELECT
   president,
   country,
   continent 
FROM
   presidents 
WHERE
   country IN 
   (
      SELECT
         country 
      FROM
         states 
      WHERE
         indep _year < 1800
   )
;
  • ~~~

SELECT country,
president
FROM presidents
WHERE continent LIKE ‘%America’
AND country NOT IN
(SELECT country
FROM states
WHERE indep_year < 1800);

* ---------

SELECT *
FROM populations
Filter for only those populations where life expectancy is 1.15 times higher than average
WHERE life _expectancy > 1.15 *
(SELECT AVG (life_expectancy)
FROM populations
WHERE year = 2015)
AND year=2015;
~~~

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
18
Q

Subqueries in select clause

A
SELECT DISTINCT continent,
(SELECT COUNT (*)
FROM monarchs
WHERE states.continent = monarch.continent) AS monarch_count
FROM states;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
19
Q

Subqueries in from clause - self join

A
SELECT DISTINCT monarchs.continent, sub.most_recent
FROM monarchs,
(SELECT
continent,
MAX(indep_year AS most_recent
FROM states
GRAUP BY continent) AS sub
WHERE monarchs.continent = sub.continent

Example 2

select name, country_code, city_proper_pop, metroarea_pop, (city_proper_pop / metroarea_pop 100 as city perc 
from cities
where name IN
( select capital
where continent like %Furone%' or continent LIKE %America')
 and metroarea_pop is not null
order by city_perc DESC
limit 10

example 3
~~~

SELECT
– Select country name and the count match IDs
c. AS country_name,
COUNT(sub) AS matches
FROM country AS c
– Inner join the subquery onto country
– Select the country id and match id columns
inner join (SELECT country_id, id
FROM match
– Filter the subquery by matches with 10+ goals
WHERE (home_goal + away_goal) >= 10 ) AS sub
ON c.country = sub.id
GROUP BY country_name;
~~~

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
20
Q

CASE WHEN

A
SELECT 

CASE
           WHEN hometeam_id = 10189 THEN 'FC Schalke 04'
           WHEN hometeam_id = * 9823 THEN 'FC Bayern Munich' 
					 ELSE 'Other' END AS home team,
					 
					 COUNT(id) AS total_matches
FROM matches germany
GROUP BY the CASE STATEMENT
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
21
Q

True or false: Subqueries in where can only return one column

A

True

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
22
Q

True or false: subqueries in select return a single aggregate value

A

True

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
23
Q

When Condition

A
SELECT date, hometeam_id, awayteam_id,
CASE WHEN hometeam_id = 8455 AND home_goal > away_goal
THEN 'Chelsea home win!'
WHEN awayteam_id = 8455 AND home_goal < away_goal
THEN 'Chelsea away win!'
ELSE 'Loss or tie :(' END AS outcome
FROM match
WHERE hometeam_id = 8455 OR awayteam_id = 8455;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
24
Case when in select statement
``` SELECT season, COUNT(CASE WHEN hometeam id = 8650 AND home_goal > away_goal THEN id END) AS home_wins FROM match GROUP BY season; ```
25
Window Function : Over clause
``` SELECT date, (home_goal + away_goal) AS goals, AVG (home_goal + away_goal) OVER() AS overall_avg FROM match WHERE season = '2011/2012'; ```
26
string_agg
Enter STRING AGG * STRING_AGG (column, separator) takes all the values of a column and concatenates them, with separator in between each value STRING_AGG (Letter, ', ") transforms this...
27
Over with a partition
``` SELECT date, (home_goal + away_goal) AS goals, AVG (home_goal + away_goal) OVER(PARTITION BY season) AS season_avg FROM match; ```
28
Partition with multiple columns
``` SELECT c. name, m. season, (home_goal + away_goal) AS goals, AVG (home_goal + away_goal) OVER(PARTITION BY m.season, c.name) AS season_ctry_avg FROM country AS c LEFT JOIN match AS m on c.id = m.country_id ```
29
Sliding Windows
Sliding window frame ``` -- Manchester City Home Games SELECT date, home_goal, away_goal, SUM (home_goal) OVER(ORDER BY date ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS last2 FROM match WHERE hometeam_ id = 8456 AND season = '2011/2012'; ``` Sliding window keywords ROWS BETWEEN AND PRECEDING FOLLOWING UNBOUNDED PRECEDING UNBOUNDED FOLLOWING CURRENT ROW
30
CTE
``` -- Set up the home team CTE WITH home AS ( SELECT m.id, t.team_long_name, CASE WHEN m.home_goal > m.away_goal THEN 'MU Win' WHEN m.home_goal < m.away_goal THEN 'MU Loss' ELSE 'Tie' END AS outcome FROM match AS m LEFT JOIN team AS t ON m.hometeam_id = t.team_api_id), -- Set up the away team CTE away AS ( SELECT m.id, t.team_long_name, CASE WHEN m.home_goal > m.away_goal THEN 'MU Loss' WHEN m.home_goal < m.away_goal THEN 'MU Win' ELSE 'Tie' END AS outcome FROM match AS m LEFT JOIN team AS t ON m.awayteam_id = t.team_api_id) -- Select team names, the date and goals SELECT DISTINCT m.date, home.team_long_name AS home_team, away.team_long_name AS away_team, m.home_goal, m.away_goal -- Join the CTEs onto the match table FROM match AS m LEFT JOIN home ON m.id = home.id LEFT JOIN away ON m.id = away.id WHERE m.season = '2014/2015' AND (home.team_long_name = 'Manchester United' OR away.team_long_name = 'Manchester United'); ```
31
Window function : Row number
``` SELECT Year, Event, Country, ROW_NUMBER() OVER () AS Row_N FROM Summer_Medals WHERE Medal = 'Gold'; ``` OR ``` SELECT Year, -- Assign numbers to each year row_number() Over() AS Row_N FROM ( SELECT year FROM Summer_Medals ORDER BY Year ASC ) AS Years ORDER BY Year ASC; ``` OR Here the biggest will be assigned a rank of 1 ``` _SELECT Year, Event, Country, ROW NUMBER() OVER (ORDER BY Year DESC, Event ASC) AS ROW_N FROM Summer_Medals WHERE Medal = 'Gold'; ```
32
Window function: LAG
Lag lets you get the last value , like creating a table with this years winner vs last years ``` WITH Discus_Gold AS ( SELECT Year, Country AS Champion FROM Summer_Medals WHERE Year IN (1996, 2000, 2004, 2008, 2012) AND Gender = 'Men' AND Medal = 'Gold' AND Event = 'Discus Throw') SELECT Year, Champion, LAG (Champion, 1) OVER (ORDER BY Year ASC) AS Last_Champion FROM Discus_Gold ORDER BY Year ASC; ```
33
Window function: Partition
Finding last years champion for example: ``` WITH Tennis_Gold AS ( SELECT DISTINCT Gender, Year, Country FROM Summer_Medals WHERE Year >= 2000 AND Event = 'Javelin Throw' AND Medal = 'Gold') SELECT Gender, Year, Country AS Champion, -- Fetch the previous year's champion by gender LAG(Country,1) OVER (PARTITION BY gender ORDER BY Year ASC) AS Last_Champion FROM Tennis_Gold ORDER BY Gender ASC, Year ASC; ``` to partition more that one thing : partition by gender,event
34
Window Functions: Fetching
The four functions Relative * LAG (column, n) returns column's value at the row n rows before the current row * LEAD (column, n) returns column's value at the row n rows after the current row Absolute * FIRST_VALUE (column) returns the first value in the table or partition * LAST VALUE (column) returns the last value in the table or partition EXAMPLE A = LEAD Query ``` WITH Hosts AS ( SELECT DISTINCT Year, City FROM Summer_Medals) SELECT Year, City, LEAD (City, 1) OVER (ORDER BY Year ASC) AS Next_City, LEAD (City, 2) OVER (ORDER BY Year ASC) AS After_Next_City FROM Hosts ORDER BY Year ASC; ``` EXAMPLE B = ``` SELECT Year, City, FIRST_VALUE (City) OVER (ORDER BY Year ASC) AS First_City, LAST_VALUE(City) OVER ( ORDER BY Year ASC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Last_City FROM Hosts ORDER BY Year ASC; ``` EXAMPLE C : ``` WITH Hosts AS ( SELECT DISTINCT Year, City FROM Summer_Medals) SELECT Year, City, -- Get the last city in which the Olympic games were held Last_value(city) OVER ( ORDER BY year ASC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS Last_City FROM Hosts ORDER BY Year ASC; ```
35
Window function: ranking
The ranking functions * ROW_NUMBER() always assigns unique numbers, even if two rows' values are the same * RANK() assigns the same number to rows with identical values, skipping over the next numbers in such cases * DENSE_RANK() also assigns the same number to rows with identical values, but doesn't skip over the next numbers ``` WITH Country_Games AS (...) SELECT Country, Games, ROW NUMBER () OVER (ORDER BY Games DESC) AS ROW_N FROM Country_Games ORDER BY Games DESC, Country ASC; ```
36
Window function: Paging/Frames
What is paging? * Paging: Splitting data into (approximately) equal chunks * Uses * Many APIs return data in "pages" to reduce data being sent * Separating data into quartiles or thirds (top middle 33%, and bottom thirds) to judge performance Enter NTILE * NTILE(n) splits the data into n approximately equal pages ``` WITH Country_Medals AS ( SELECT Country, COUNT(* AS Medals FROM Summer_Medals GROUP BY Country), SELECT Country, Medals, NTILE (3) OVER (ORDER BY Medals DESC) AS Third FROM Country_Medals; ``` The you can group the results ``` WITH Country_Medals AS (...), Thirds AS ( SELECT Country, Medals, NTILE (3) OVER (ORDER BY Medals DESC AS Third FROM Country_Medals) SELECT Third, ROUND(AVG (Medals), 2) AS Avg_Medals FROM Thirds GROUP BY Third ORDER BY Third ASC; ```
37
Window Functions: Aggregation
``` WITH Brazil_Medals AS (...) SELECT Year, Medals, MAX(Medals) OVER (ORDER BY Year ASC) AS Max_ Medals FROM Brazil_Medals; ```
38
Window Functions : Frames
ROWS BETWEEN * ROWS BETWEEN [START AND [FINISH] o n PRECEDING: n rows before the current row * CURRENT ROW: the current row o n FOLLOWING : n rows after the current row Examples * ROWS BETWEEN 3 PRECEDING AND CURRENT ROW = 4 rows * ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING = 3 rows * ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING = 5 rows ``` WITH Russia_Medals AS (...) SELECT Year, Medals, MAX(Medals) OVER (ORDER BY Year ASC) AS Max_Medals, MAX (Medals) OVER ORDER BY Year ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS Max_Medals_Last FROM Russia_Medals ORDER BY Year ASC; ```
39
Window Function: Moving Averages/Sum
* Moving average (MA): Average of last n periods * Example: 10-day MA of units sold in sales is the average of the last 10 days' sold units * Used to indicate momentum/trends * Also useful in eliminating seasonality * Moving total: Sum of last n periods * Example: Sum of the last 3 Olympic games' medals * Used to indicate performance; if the sum is going down, overall performance is going down ``` WITH US Medals AS (...) SELECT Year, Medals, AVG(Medals) OVER (ORDER BY Year ASC ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS Medals_MA FROM US_Medals ORDER BY Year ASC; ``` Note moving averages cant see into the future, so its always going to end on current row
40
Pivots/Crosstab
Enter CROSSTAB ``` CREATE EXTENSION IF NOT EXISTS tablefunc; SELECT * FROM CROSSTAB ($$ source_sql TEXT $$) AS ct (column_1 DATA_TYPE_1, column_2 DATA_TYPE_2, * . . , column_n DATA_TYPE_N); ``` Example ``` CREATE EXTENSION IF NOT EXISTS tablefunc; SELECT * FROM CROSSTAB($$ SELECT Country, Year, COUNT(*) :: INTEGER AS Awards FROM Summer_Medals WHERE Country IN ('CHN', 'RUS', 'USA') AND Year IN (2008, 2012) AND Medal = 'Gold' GROUP BY Country, Year ORDER BY Country ASC, Year ASC; $$) AS ct (Country VARCHAR, "2008" INTEGER, "2012" INTEGER ORDER BY Country ASC; ```
41
ROLLUP and CUBE
``` SELECT Country, Medal, COUNT(*) AS Awards FROM Summer_Medals WHERE Year = 2008 AND Country IN ('CHN', 'RUS') GROUP BY Country, ROLLUP(Medal) ORDER BY Country ASC, Medal ASC; ``` * ROLLUP is a GROUP BY subclause that includes extra rows for group-level aggregations * GROUP BY Country, ROLLUP(Medal) will count all Country - and Medal -level totals, then count only Country -level totals and fill in Medal with nulls for these rows Enter CUBE ``` SELECT Country, Medal, COUNT(* AS Awards FROM summer_medals WHERE Year = 2008 AND Country IN ('CHN', 'RUS') GROUP BY CUBE(Country, Medal) ORDER BY Country ASC, Medal ASC; ``` * CUBE is a non-hierarchical ROLLUP * It generates all possible group-level aggregations * CUBE (Country, Medal) counts Country -level, Medal -level, and grand totals
42
COALESCE
Enter COALESCE * COALESCE() takes a list of values and returns the first non- null value, going from left to right * COALESCE (null, null, 1, null, 2) ? 1 * Useful when using SQL operations that return nulls * * * ROLLUP and CUBE * * * Pivoting * * * LAG and LEAD ``` SELECT COALESCE(Country, 'Both countries') AS Country, COALESCE (Medal, 'All medals') AS Medal, COUNT (*) AS Awards FROM summer_medals WHERE Year = 2008 AND Country IN ('CHN', 'RUS') GROUP BY ROLLUP(Country, Medal) ORDER BY Country ASC, Medal ASC; ``` it replaces the nulls with 'both countries' and 'all medals' respectively
43
STRING_AGG
STRING_AGG * STRING_AGG (column, separator) takes all the values of a column and concatenates them, with separator in between each value STRING_AGG(Letter, ", ") transforms this...
44
How to query for data type
Determining data types from existing tables ``` SELECT column_name, data_type FROM INFORMATION_SCHEMA.COLUMNS WHERE column name in ('title', 'description', 'special_features') AND table_name ='film'; ```
45
INTERVAL data types
``` SELECT rental_date + INTERVAL '3 days' as expected_return from rental ```
46
Creating Tables in Postgres
Before we get started, creating tables in sql ``` CREATE TABLE example CREATE TABLE my_first_table ( first_column text, second_column integer); INSERT example INSERT INTO my_first_table (first_column, second_column) VALUES ('text value', 12); ``` creating tables in postgress, supports arrays ``` CREATE TABLE grades ( student_id int, email text[][], test_scores int[] ); ``` Note that PostgreSQL array indexes start with one and not zero.
47
The ANY function in Postgress
The ANY function allows you to search for a value in any index position of an ARRAY. Here's an example. `WHERE 'search text' = ANY(array_name)` The contains operator @> operator is alternative syntax to the ANY function and matches data in an ARRAY using the following syntax. ``` WHERE array_name @> ARRAY['search text'] :: type[] WHERE special_features @> ARRAY['Deleted Scenes']; ```
48
DATES in Postgres
Calculating time periods with AGE `SELECT AGE(timestamp '2005-09-11 00:00:00', timestamp '2005-09-09 12:00:00');` ``` SELECT INTERVAL '1' day * timestamp '2019-04-10 12:34:56' ``` Retrieving the current timestamp `SELECT NOW::timestamp;` `SELECT CAST(NOW() as timestamp);` ``` SELECT CURRENT_ TIMESTAMP (2); ``` CURRENT DATE `SELECT CURRENT_DATE;` Extracting and transforming date / time data * EXTRACT field FROM source ``` SELECT EXTRACT (quarter FROM timestamp 2005-01-24 05:12:00') AS quarter; ``` * DATE_PART('field', source) ``` SELECT DATE_PART ('quarter', timestamp '2005-01-24 05:12:00') AS quarter; ``` EXAMPLE A ``` SELECT EXTRACT (quarter FROM payment_date) AS quarter, EXTRACT (year FROM payment_date) AS year, SUM (amount) AS total_payments FROM payment GROUP BY 1, 2; ``` Truncating timestamps using DATE_TRUNCO `SELECT DATE_TRUNC ('year', TIMESTAMP 2005-05-21 15:30:30');`
49
Postgres Date
``` SELECT c.first_name || ' ' |I c. last_name AS customer_name, f.title, r.rental_date, - - Extract the day of week date part from the rental_date EXTRACT (dow FROM r. rental_date) AS dayofweek, AGE (r.return_date, r.rental_date) AS rental_days, - Use DATE_TRUNC to get days from the AGE function CASE WHEN DATE_TRUNC(' day', AGE(r. return_date, r.rental_date)) > - - Calculate number of d f.rental_duration * INTERVAL '1' day THEN TRUE ELSE FALSE END AS past_due FROM film AS f INNER JOIN inventory AS i ON f.film_id = i.film_id INNER JOIN rental AS r ION i.inventory_id = r. inventory_id INNER JOIN customer AS c ON c.customer_id = r. customer_id WHERE - Use an INTERVAL for the upper bound of the rental_date r.rental_date BETWEEN CAST ('2005-05-01' AS DATE) AND CAST ('2005-05-01' AS DATE) + INTERVAL '90 day'; ```
50
String Concatination
``` SELECT first_name, last_name, first_name |1 ' ' I last_name AS full_name FROM customer ``` String concatenation with functions ``` SELECT CONCAT (first_name,' ', last_name) AS full_name FROM customer; ```
51
String Manipulation: Upper, lower, Title caps, replace
``` SELECT UPPER (email) FROM customer; ``` Lower, initcap Replacing characters in a string ``` SELECT REPLACE(description, 'A Astounding', 'An Astounding') as description FROM film; ```
52
Advanced String Functions
Char Length = length of characters in a string ``` SELECT title, CHAR_LENGTH(title) or length(title)l FROM film; ``` Index/position of a character in a string ``` SELECT email, POSITION('@' IN email) FROM customer; ``` Parsing : extract first n position, you can also use left ``` SELECT RIGHT (description, 50) FROM film; ``` Substring ``` SELECT FROM SUBSTRING(description, 10, 50) film AS f; ``` Extracting substrings of character data ``` SELECT SUBSTRING (email FROM 0 FOR POSITION('@' IN email)) FROM customer; ``` Extracting substrings of character data ``` SELECT FROM SUBSTRING(email FROM POSITION('@' IN email)+1 FOR CHAR_LENGTH (email)) customer ``` EXAMPLE : ``` SELECT UPPER(c.name) || ': ' || f.title AS film_category, -- Truncate the description without cutting off a word left(description, 50 - -- Subtract the position of the first whitespace character position( ' ' IN REVERSE(LEFT(description, 50)) ) ) FROM film AS f INNER JOIN film_category AS fc ON f.film_id = fc.film_id INNER JOIN category AS c ON fc.category_id = c.category_id; ```
53
Truncate, replace, over write
Trimming white space ``` SELECT TRIM(' padded"); ``` `TRIM([leading | trailing | both] [characters] from string)` Padding strings with character data `SELECT LPAD('padded', 10, '#');` concat & pad --result = Noor Habbal ``` SELECT first_name || lpad(last_name, LENGTH(last_name)+1) AS full_name FROM customer; ```
54
Full text search
What is full-text search? Full text search provides a means for performing natural language queries of text data in your database. * Stemming * Spelling mistakes * Ranking LIKE versus full-text search ``` SELECT title, description FROM film WHERE to_tsvector (title) @@ to_tsquery ('elf'); ``` ``` SELECT title, description, -- Calculate the similarity similarity(description, 'Astounding & Drama') FROM film WHERE to_tsvector(description) @@ to_tsquery('Astounding & Drama') ORDER BY similarity(description, 'Astounding & Drama') DESC; ``` ## Footnote tsvector vectorizes a word, to_tsvector does a count of each word ' china':90
55
creating functions
User-defined data types Enumerated data types ``` CREATE TYPE dayofweek AS ENUM ( 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ): ``` query user-defined types ``` SELECT typname, typcategory FROM pg_type WHERE typname='dayofweek'; ``` get more information about a table ``` SELECT column_name, data_type, udt_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name ='film'; ``` User-defined functions ``` CREATE FUNCTION squared(i integer) RETURNS integer AS $$ BEGIN RETURN i * i; END; $$ LANGUAGE plpgsal; ``` -- Select the column name, data type and udt name columns ``` SELECT column_name, data_type, udt_name FROM INFORMATION_SCHEMA.COLUMNS -- Filter by the rating column in the film table WHERE table_name ='film' AND column_name='rating'; ``` pg_type is the postgress catalog table, which holds information about all the datatypes in the databse
56
Postgres extentions
PostgreSQL extensions - Commonly used extensions * PostGIS * PostPic * fuzzystrmatch * pg_trgm Available Extensions ``` SELECT name FROM pg_available_extensions; ``` Installed Extensions ``` SELECT extname FROM pg_extension; ``` --Enable the fuzzystrmatch extension ``` CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; ``` --Confirm that fuzzstrmatch has been enabled `SELECT extname FROM pg_extension;` `SELECT * FROM pg_extension;` Using fuzzystrmatch or fuzzy searching ``` SELECT levenshtein('GUMBO', 'GAMBOL'); ``` Compare two strings with pg_trgm `SELECT similarity ('GUMBO', 'GAMBOL');`
57
Reminders of NULL, IS NULL, count(col), count(*)
NULL = missing IS NULL, IS NOT NULL don't use = NULL count (*) = number of rows count (column_name) = number of non- NULL values count (DISTINCT column_name) = number of different non- NULL values SELECT DISTINCT column_name = distinct values, including NULL
58
Coalesce function
Return the first non-null value in a list Coalesce(col2, col3) `SELECT coalesce(industry, sector, 'Unknown') AS industry2,` so this code checks industry, if the row is null, it replaces it with the value from sector, if that's null, it replaces it with unknown
59
Casting
Format -- With the CAST function SELECT CAST (value AS new_type);
60
Column constraints
* Foreign key: value that exists in the referenced column, or NULL * Primary key: unique, not NULL * Unique: values must all be different except for NULL * Not null: NULL not allowed: must have a value * Check constraints: conditions on the values o column1 > 0 * columnA > columnB
61
Summary Statistics
``` -- Summarize by group with GROUP BY SELECT tag, min (question_pct), avg (question_pct), max (question_pct) FROM stackoverflow GROUP BY tag; ``` standard deviation ``` SELECT stddev(question_pct) FROM stackoverflow; ``` Sample Variance , divides by the number of values -1 ``` SELECT var_samp(question_pct) FROM stackoverflow; ``` population variance, difference is -1 ``` SELECT var_pop (question_pot) FROM stackoverflow; ```
62
Summary function : Truncating
Truncate `SELECT trunc (42.1256, 2);` this is great is you're counting and the values in the column are like 21, 22, 23. So you can use truncate to do truncate -1, and they'll all be grouped in the 20 bin. Esentially you can use it to examine distributions ``` -- Truncate employees SELECT trunc(employees, -4) AS employee_bin, -- Count number of companies with each truncated value count(title) FROM fortune500 WHERE employees < 100000 GROUP BY employee_bin ORDER BY employee_bin; ```
63
Summary function : Generating series and creating bins
``` - - Create bins WITH bins AS ( SELECT generate_series (30, 60, 5) AS lower, generate_series (35, 65, 5) AS upper ), -- Subset data to tag of interest ebs AS ( SELECT Unanswered_count FROM stackoverflow WHERE tag = 'amazon-ebs' ) - - Count values in each bin SELECT lower, upper, count (unanswered_count) -- left join keeps all bins FROM bins LEFT JOIN ebs ON unanswered_count >= lower AND Unanswered_count < upper ``` 1) find the min and max, but using the min and max functions in sql query 2) use generate series to create the bins 3) then put the values in the query above
64
Summary Functions : Corr,Median/ Percentile functions
correlation function ``` SELECT corr(assets, equity) FROM fortune500; ``` Median/ Percentile functions - lets say you want the top 20% of , so you take the .8 percentile -- percentile between 0 and 1 EXAMPLE = col A , values = 1345 ``` SELECT percentile_disc(percentile) WITHIN GROUP (ORDER BY column_name) FROM table; ``` ^ returns 3 ``` SELECT percentile_cont (percentile) WITHIN GROUP (ORDER BY column_name) FROM table; ``` ^ returns 3.5 ``` create temp table profit80 AS SELECT sector, percentile_disc(.8) within group (order by profits) AS pct80 from fortune500 group by sector; ```
65
Create temp table, drop table
``` Create Temp Table Syntax - - Create table as CREATE TEMP TABLE new_tablename AS -- Query results to store in the table SELECT column1, column2 FROM table; ``` `DROP TABLE IF EXISTS top_companies;` EXAMPLE: ``` DROP TABLE IF EXISTS profit80; CREATE TEMP TABLE profit80 AS SELECT sector, percentile_disc(0.8) WITHIN GROUP (ORDER BY profits) AS pct80 FROM fortune500 GROUP BY sector; SELECT title, fortune500.sector, profits, profits/pct80 AS ratio FROM fortune500 LEFT JOIN profit80 ON fortune500.sector =profit80.sector WHERE profits > pct80; ```
66
Interesting correlation table
``` DROP TABLE IF EXISTS correlations; CREATE TEMP TABLE correlations AS SELECT 'profits'::varchar AS measure, corr(profits, profits) AS profits, corr(profits, profits_change) AS profits_change, corr(profits, revenues_change) AS revenues_change FROM fortune500; INSERT INTO correlations SELECT 'profits_change'::varchar AS measure, corr(profits_change, profits) AS profits, corr(profits_change, profits_change) AS profits_change, corr(profits_change, revenues_change) AS revenues_change FROM fortune500; INSERT INTO correlations SELECT 'revenues_change'::varchar AS measure, corr(revenues_change, profits) AS profits, corr(revenues_change, profits_change) AS profits_change, corr(revenues_change, revenues_change) AS revenues_change FROM fortune500; -- Select each column, rounding the correlations SELECT measure, ROUND(profits::numeric, 2) AS profits, ROUND(profits_change::numeric, 2) AS profits_change, ROUND(revenues_change::numeric, 2) AS revenues_change FROM correlations; ``` this creates a table like measure ---- profits ---- profits_change profits ----- 1 ------ 2 profits_change - 1 ------ 2
67
Exploring categorical data and unstructured text
``` -- To clear table if it already exists DROP TABLE IF EXISTS indicators; -- Create the temp table CREATE TEMP TABLE indicators AS SELECT id, CAST (description LIKE '%@%' AS integer) AS email, CAST (description LIKE '%___-___-____%' AS integer) AS phone FROM evanston311; -- Select the column you'll group by SELECT priority, -- Compute the proportion of rows with each indicator SUM(email)/COUNT(*)::numeric AS email_prop, SUM(phone)/COUNT(*)::numeric AS phone_prop -- Tables to select from FROM evanston311 left JOIN indicators -- Joining condition ON evanston311.id=indicators.id -- What are you grouping by? GROUP BY priority; ``` ________________________________________________________________________
SELECT*FROM fruit WHERE lower(fav_fruit)='apple';
case insensitive = LIKE %apple%
trimming trim : bothendstrim(' abc ')='abc' rtrim ltrim SELECTtrim('Wow!', '!'); Wow SELECTtrim('Wow!', '!wW'); o
subbstring SELECT left('abcde', 2), = ab
Substring SELECT substring(string FROM start FOR length) SELECTsubstring('abcdef' FROM 2 FOR 3) = bcd
split part SELECT split_part('a,bc,d', ',', 2); bc
concat SELECT concat('a', 2, 'cc'); SELECT'a'||2||'cc'
``` -- Case for each of :, -, and I SELECT CASE WHEN category LIKE '%: %' THEN split_part(category, ': ', 1) WHEN category LIKE '% - %' THEN split_part (category, ' - ', 1) ELSE split_part(category, ' | ', 1) END AS major_category, -- alias the result sum (businesses) - - also select number of businesses FROM naics GROUP BY major_category; -- Group by categories created above ```
68
dates
Compare the number of requests created per month to the number completed. -- Compute monthly counts of requests created WITH created AS ( SELECT date_trunc('month', date_created) AS month, count(*) AS created_count FROM evanston311 WHERE category='Rodents- Rats' GROUP BY month), -- Compute monthly counts of requests completed completed AS ( SELECT date_trunc('month',date_completed) AS month, count(*) AS completed_count FROM evanston311 WHERE category='Rodents- Rats' GROUP BY month) -- Join monthly created and completed counts SELECT created.month, created_count, completed_count FROM created INNER JOIN completed ON created.month=completed.month ORDER BY created.month;
69
grouping sets : OLAP operators includes rollup and cube
GROUP BY GROUPING SETS Example of a query with GROUPING SETS operator: ``` SELECT country, genre, COUNT (*) FROM rentings_extended GROUP BY GROUPING SETS ((country, genre), (country), (genre), ()); ``` * Column names surrounded by parentheses represent one level of aggregation. *** GROUP BY GROUPING SETS returns a UNION over several GROUP BY queries.** equivalant to a group by cube (country and genre)