Consultas de Selección: Group By, Having, Union, Intersect, Except, Any¶
Conexión a la Base de Datos¶
[1]:
import pandas as pd
import psycopg2
import sqlalchemy
import matplotlib as plt
%matplotlib inline
from sqlalchemy import create_engine
POSTGRES_ADDRESS = 'localhost' # Este es el servidor, puede ser una IP
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'postgres'
POSTGRES_PASSWORD = '1234'
POSTGRES_DBNAME = 'dvdrental'
# Ahora se configura la cadena de conexión.
# Esta es una cadena que contiene los parámetros necesarios para establecer una
# conexión con posgres.
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(username=POSTGRES_USERNAME,
password=POSTGRES_PASSWORD,
ipaddress=POSTGRES_ADDRESS,
port=POSTGRES_PORT,
dbname=POSTGRES_DBNAME))
# A continuación se crea la conexión
cnx = create_engine(postgres_str)
La cláusula Group By¶
La cláusula GROUP BY divide las filas devueltas de la declaración SELECT en grupos. Para cada grupo, puede aplicar una función agregada, por ejemplo, SUM() para calcular la suma de elementos o COUNT() para obtener el número de elementos en los grupos.
[2]:
sql = '''
SELECT
customer_id,
SUM (amount) as total_pago,
AVG (amount) as promedio_pago
FROM
payment
GROUP BY
customer_id;
'''
pd.read_sql_query(sql, cnx)
[2]:
customer_id | total_pago | promedio_pago | |
---|---|---|---|
0 | 184 | 80.80 | 4.040000 |
1 | 87 | 137.72 | 4.918571 |
2 | 477 | 106.79 | 5.085238 |
3 | 273 | 130.72 | 4.668571 |
4 | 550 | 151.69 | 4.893226 |
... | ... | ... | ... |
594 | 449 | 80.83 | 4.754706 |
595 | 64 | 91.70 | 3.056667 |
596 | 520 | 127.69 | 4.119032 |
597 | 55 | 84.81 | 4.463684 |
598 | 148 | 211.55 | 4.701111 |
599 rows × 3 columns
[3]:
sql = '''
SELECT
customer_id,
SUM (amount) as total_pago,
AVG (amount) as promedio_pago
FROM
payment
GROUP BY
customer_id
ORDER BY
total_pago DESC;
'''
pd.read_sql_query(sql, cnx)
[3]:
customer_id | total_pago | promedio_pago | |
---|---|---|---|
0 | 148 | 211.55 | 4.701111 |
1 | 526 | 208.58 | 4.966190 |
2 | 178 | 194.61 | 4.990000 |
3 | 137 | 191.62 | 5.042632 |
4 | 144 | 189.60 | 4.740000 |
... | ... | ... | ... |
594 | 110 | 49.88 | 4.156667 |
595 | 320 | 47.85 | 3.190000 |
596 | 248 | 37.87 | 2.913077 |
597 | 281 | 32.90 | 3.290000 |
598 | 318 | 27.93 | 3.990000 |
599 rows × 3 columns
[4]:
sql = '''
SELECT
customer_id,
SUM (amount)
FROM
payment
GROUP BY
customer_id
ORDER BY
SUM (amount) DESC;
'''
pd.read_sql_query(sql, cnx)
[4]:
customer_id | sum | |
---|---|---|
0 | 148 | 211.55 |
1 | 526 | 208.58 |
2 | 178 | 194.61 |
3 | 137 | 191.62 |
4 | 144 | 189.60 |
... | ... | ... |
594 | 110 | 49.88 |
595 | 320 | 47.85 |
596 | 248 | 37.87 |
597 | 281 | 32.90 |
598 | 318 | 27.93 |
599 rows × 2 columns
[5]:
# Obtener también el nombre (¡Cuidado con los homónimos!)
sql = '''
SELECT
first_name || ' ' || last_name nombre_completo,
SUM (amount) as total_pago,
AVG (amount) as promedio_pago
FROM
payment
INNER JOIN customer USING (customer_id)
GROUP BY nombre_completo
ORDER BY total_pago;
'''
pd.read_sql_query(sql, cnx)
[5]:
nombre_completo | total_pago | promedio_pago | |
---|---|---|---|
0 | Brian Wyman | 27.93 | 3.990000 |
1 | Leona Obrien | 32.90 | 3.290000 |
2 | Caroline Bowman | 37.87 | 2.913077 |
3 | Anthony Schwab | 47.85 | 3.190000 |
4 | Tiffany Jordan | 49.88 | 4.156667 |
... | ... | ... | ... |
594 | Clara Shaw | 189.60 | 4.740000 |
595 | Rhonda Kennedy | 191.62 | 5.042632 |
596 | Marion Snyder | 194.61 | 4.990000 |
597 | Karl Seal | 208.58 | 4.966190 |
598 | Eleanor Hunt | 211.55 | 4.701111 |
599 rows × 3 columns
[6]:
# Agrupar por Fecha
sql = '''
SELECT
DATE(payment_date) paid_date,
SUM(amount) sum
FROM
payment
GROUP BY
DATE(payment_date);
'''
pd.read_sql_query(sql, cnx)
[6]:
paid_date | sum | |
---|---|---|
0 | 2007-02-14 | 116.73 |
1 | 2007-02-19 | 1290.90 |
2 | 2007-02-20 | 1219.09 |
3 | 2007-03-19 | 2617.69 |
4 | 2007-04-26 | 347.21 |
5 | 2007-04-08 | 2227.84 |
6 | 2007-02-15 | 1188.92 |
7 | 2007-04-28 | 2622.73 |
8 | 2007-03-17 | 2442.16 |
9 | 2007-03-20 | 2669.89 |
10 | 2007-03-23 | 2342.43 |
11 | 2007-03-21 | 2868.27 |
12 | 2007-04-29 | 2717.60 |
13 | 2007-04-10 | 1973.18 |
14 | 2007-05-14 | 514.18 |
15 | 2007-04-12 | 1930.48 |
16 | 2007-03-02 | 2550.05 |
17 | 2007-04-27 | 2673.57 |
18 | 2007-03-16 | 299.28 |
19 | 2007-02-18 | 1275.98 |
20 | 2007-04-07 | 1984.28 |
21 | 2007-02-21 | 917.87 |
22 | 2007-02-16 | 1154.18 |
23 | 2007-03-18 | 2701.76 |
24 | 2007-04-06 | 2077.14 |
25 | 2007-04-30 | 5723.89 |
26 | 2007-03-01 | 2808.24 |
27 | 2007-04-11 | 1940.32 |
28 | 2007-04-09 | 2067.86 |
29 | 2007-04-05 | 273.36 |
30 | 2007-02-17 | 1188.17 |
31 | 2007-03-22 | 2586.79 |
Cláusulas HAVING¶
La cláusula HAVING especifica una condición de búsqueda para un grupo o un agregado. Se usa a menudo con la cláusula GROUP BY para filtrar grupos o agregados en función de una condición específica.
[7]:
#Los Clientes que han gastado más de 200 dólares
sql = '''
SELECT
customer_id,
SUM (amount)
FROM
payment
GROUP BY
customer_id
HAVING
SUM (amount) > 200;
'''
pd.read_sql_query(sql, cnx)
[7]:
customer_id | sum | |
---|---|---|
0 | 526 | 208.58 |
1 | 148 | 211.55 |
[8]:
#Las tiendas con más de 300 clientes
sql = '''
SELECT
store_id,
COUNT (customer_id)
FROM
customer
GROUP BY
store_id
HAVING
COUNT (customer_id) > 300;
'''
pd.read_sql_query(sql, cnx)
[8]:
store_id | count | |
---|---|---|
0 | 1 | 326 |
El operador de UNION¶
El operador UNION combina conjuntos de resultados de dos o más declaraciones SELECT en un único conjunto de resultados.
[9]:
sql = '''
DROP TABLE IF EXISTS top_rated_films;
CREATE TABLE top_rated_films(
title VARCHAR NOT NULL,
release_year SMALLINT
);
DROP TABLE IF EXISTS most_popular_films;
CREATE TABLE most_popular_films(
title VARCHAR NOT NULL,
release_year SMALLINT
);
INSERT INTO
top_rated_films(title,release_year)
VALUES
('The Shawshank Redemption',1994),
('The Godfather',1972),
('12 Angry Men',1957);
INSERT INTO
most_popular_films(title,release_year)
VALUES
('An American Pickle',2020),
('The Godfather',1972),
('Greyhound',2020);
'''
cnx.execute(sql)
[9]:
<sqlalchemy.engine.result.ResultProxy at 0x2d360e281f0>
[10]:
sql = '''
SELECT * FROM top_rated_films;
'''
pd.read_sql_query(sql, cnx)
[10]:
title | release_year | |
---|---|---|
0 | The Shawshank Redemption | 1994 |
1 | The Godfather | 1972 |
2 | 12 Angry Men | 1957 |
[11]:
sql = '''
SELECT * FROM most_popular_films;
'''
pd.read_sql_query(sql, cnx)
[11]:
title | release_year | |
---|---|---|
0 | An American Pickle | 2020 |
1 | The Godfather | 1972 |
2 | Greyhound | 2020 |
[12]:
# La unión de dos tablas sin duplicados
sql = '''
SELECT * FROM top_rated_films
UNION
SELECT * FROM most_popular_films;
'''
pd.read_sql_query(sql, cnx)
[12]:
title | release_year | |
---|---|---|
0 | An American Pickle | 2020 |
1 | Greyhound | 2020 |
2 | The Shawshank Redemption | 1994 |
3 | The Godfather | 1972 |
4 | 12 Angry Men | 1957 |
[13]:
# La unión de dos tablas con duplicados
sql = '''
SELECT * FROM top_rated_films
UNION ALL
SELECT * FROM most_popular_films;
'''
pd.read_sql_query(sql, cnx)
[13]:
title | release_year | |
---|---|---|
0 | The Shawshank Redemption | 1994 |
1 | The Godfather | 1972 |
2 | 12 Angry Men | 1957 |
3 | An American Pickle | 2020 |
4 | The Godfather | 1972 |
5 | Greyhound | 2020 |
[14]:
# La unión de dos tablas con duplicados
sql = '''
SELECT * FROM top_rated_films
UNION ALL
SELECT * FROM most_popular_films
ORDER BY title;
'''
pd.read_sql_query(sql, cnx)
[14]:
title | release_year | |
---|---|---|
0 | 12 Angry Men | 1957 |
1 | An American Pickle | 2020 |
2 | Greyhound | 2020 |
3 | The Godfather | 1972 |
4 | The Godfather | 1972 |
5 | The Shawshank Redemption | 1994 |
Operador INTERSECT¶
Devuelve las filas que están disponibles en ambos conjuntos de resultados. Ambos conjuntos DEBEN tener la misma cantidad de columnas y tipo.
[15]:
# La unión de dos tablas con duplicados
sql = '''
SELECT *
FROM most_popular_films
INTERSECT
SELECT *
FROM top_rated_films;
'''
pd.read_sql_query(sql, cnx)
[15]:
title | release_year | |
---|---|---|
0 | The Godfather | 1972 |
Operador EXCEPT¶
Devuelve filas distintas de la primera consulta A (izquierda) que no están en la salida de la segunda consulta B (derecha).
Except corresponde al área sombreada. Fuente
[16]:
# La unión de dos tablas con duplicados
sql = '''
SELECT * FROM top_rated_films
EXCEPT
SELECT * FROM most_popular_films;
'''
pd.read_sql_query(sql, cnx)
[16]:
title | release_year | |
---|---|---|
0 | The Shawshank Redemption | 1994 |
1 | 12 Angry Men | 1957 |
[17]:
# Eliminar las tablas del ejemplo
sql = '''
DROP TABLE IF EXISTS top_rated_films;
DROP TABLE IF EXISTS most_popular_films;
'''
cnx.execute(sql)
[17]:
<sqlalchemy.engine.result.ResultProxy at 0x2d35db590a0>
El operador ANY / SOME¶
Permite comparar un valor con un conjunto de valores devueltos por una subconsulta.
El operador ANY debe estar precedido por uno de los siguientes operadores de comparación =, <=,>, <,> y <>
El operador ANY devuelve verdadero si cualquier valor de la subconsulta cumple la condición; de lo contrario, devuelve falso.
Tenga en cuenta que SOME es sinónimo de ANY, puede sustituir SOME por ANY cualquier sentencia SQL.
[18]:
# Las películas cuya duración es mayor o igual que
# la duración máxima de cualquier categoría de película.
sql = '''
SELECT title
FROM film
WHERE length >= ANY(
SELECT MAX( length )
FROM film
INNER JOIN film_category USING(film_id)
GROUP BY category_id );
'''
pd.read_sql_query(sql, cnx)
[18]:
title | |
---|---|
0 | Alley Evolution |
1 | Analyze Hoosiers |
2 | Anonymous Human |
3 | Baked Cleopatra |
4 | Casualties Encino |
... | ... |
64 | Wild Apollo |
65 | Worst Banger |
66 | Wrong Behavior |
67 | Young Language |
68 | Youth Kick |
69 rows × 1 columns
Igual ANY ( = ANY) es equivalente al operador IN
[19]:
# obtiener las películas cuya categoría es Actiono Drama.
sql = '''
SELECT
title,
category_id
FROM
film
INNER JOIN film_category
USING(film_id)
WHERE
category_id = ANY(
SELECT
category_id
FROM
category
WHERE
NAME = 'Action'
OR NAME = 'Drama'
);
'''
pd.read_sql_query(sql, cnx)
[19]:
title | category_id | |
---|---|---|
0 | Amadeus Holy | 1 |
1 | American Circus | 1 |
2 | Antitrust Tomatoes | 1 |
3 | Apollo Teen | 7 |
4 | Ark Ridgemont | 1 |
... | ... | ... |
121 | Werewolf Lola | 1 |
122 | West Lion | 7 |
123 | Witches Panic | 7 |
124 | Women Dorado | 1 |
125 | Worst Banger | 1 |
126 rows × 2 columns
[20]:
# Consulta IN equivalente
sql = '''
SELECT
title,
category_id
FROM
film
INNER JOIN film_category
USING(film_id)
WHERE
category_id IN (
SELECT
category_id
FROM
category
WHERE
NAME = 'Action'
OR NAME = 'Drama'
);
'''
pd.read_sql_query(sql, cnx)
[20]:
title | category_id | |
---|---|---|
0 | Amadeus Holy | 1 |
1 | American Circus | 1 |
2 | Antitrust Tomatoes | 1 |
3 | Apollo Teen | 7 |
4 | Ark Ridgemont | 1 |
... | ... | ... |
121 | Werewolf Lola | 1 |
122 | West Lion | 7 |
123 | Witches Panic | 7 |
124 | Women Dorado | 1 |
125 | Worst Banger | 1 |
126 rows × 2 columns