Consultas de Selección: Group By, Having, Union, Intersect, Except, Any¶

Conexión a la Base de Datos¶

[1]:

import pandas as pd
import psycopg2
import sqlalchemy
import matplotlib as plt

%matplotlib inline

from sqlalchemy import create_engine

POSTGRES_ADDRESS = 'localhost' # Este es el servidor, puede ser una IP
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'postgres'
POSTGRES_PASSWORD = '1234'
POSTGRES_DBNAME = 'dvdrental'

# Ahora se configura la cadena de conexión.
# Esta es una cadena que contiene los parámetros necesarios para establecer una
# conexión con posgres.
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(username=POSTGRES_USERNAME,
                                                                                        password=POSTGRES_PASSWORD,
                                                                                        ipaddress=POSTGRES_ADDRESS,
                                                                                        port=POSTGRES_PORT,
                                                                                        dbname=POSTGRES_DBNAME))
# A continuación se crea la conexión
cnx = create_engine(postgres_str)

La cláusula Group By¶

La cláusula GROUP BY divide las filas devueltas de la declaración SELECT en grupos. Para cada grupo, puede aplicar una función agregada, por ejemplo, SUM() para calcular la suma de elementos o COUNT() para obtener el número de elementos en los grupos.

[2]:

sql = '''
SELECT
    customer_id,
    SUM (amount) as total_pago,
    AVG (amount) as promedio_pago
FROM
    payment
GROUP BY
    customer_id;
'''

pd.read_sql_query(sql, cnx)

[2]:

	customer_id	total_pago	promedio_pago
0	184	80.80	4.040000
1	87	137.72	4.918571
2	477	106.79	5.085238
3	273	130.72	4.668571
4	550	151.69	4.893226
...	...	...	...
594	449	80.83	4.754706
595	64	91.70	3.056667
596	520	127.69	4.119032
597	55	84.81	4.463684
598	148	211.55	4.701111

599 rows × 3 columns

[3]:

sql = '''
SELECT
    customer_id,
    SUM (amount) as total_pago,
    AVG (amount) as promedio_pago
FROM
    payment
GROUP BY
    customer_id
ORDER BY
    total_pago DESC;
'''

pd.read_sql_query(sql, cnx)

[3]:

	customer_id	total_pago	promedio_pago
0	148	211.55	4.701111
1	526	208.58	4.966190
2	178	194.61	4.990000
3	137	191.62	5.042632
4	144	189.60	4.740000
...	...	...	...
594	110	49.88	4.156667
595	320	47.85	3.190000
596	248	37.87	2.913077
597	281	32.90	3.290000
598	318	27.93	3.990000

599 rows × 3 columns

[4]:

sql = '''
SELECT
    customer_id,
    SUM (amount)
FROM
    payment
GROUP BY
    customer_id
ORDER BY
    SUM (amount) DESC;
'''

pd.read_sql_query(sql, cnx)

[4]:

	customer_id	sum
0	148	211.55
1	526	208.58
2	178	194.61
3	137	191.62
4	144	189.60
...	...	...
594	110	49.88
595	320	47.85
596	248	37.87
597	281	32.90
598	318	27.93

599 rows × 2 columns

[5]:

# Obtener también el nombre (¡Cuidado con los homónimos!)
sql = '''
SELECT
    first_name || ' ' || last_name nombre_completo,
    SUM (amount) as total_pago,
    AVG (amount) as promedio_pago
FROM
    payment
INNER JOIN customer USING (customer_id)
GROUP BY nombre_completo
ORDER BY total_pago;
'''

pd.read_sql_query(sql, cnx)

[5]:

	nombre_completo	total_pago	promedio_pago
0	Brian Wyman	27.93	3.990000
1	Leona Obrien	32.90	3.290000
2	Caroline Bowman	37.87	2.913077
3	Anthony Schwab	47.85	3.190000
4	Tiffany Jordan	49.88	4.156667
...	...	...	...
594	Clara Shaw	189.60	4.740000
595	Rhonda Kennedy	191.62	5.042632
596	Marion Snyder	194.61	4.990000
597	Karl Seal	208.58	4.966190
598	Eleanor Hunt	211.55	4.701111

599 rows × 3 columns

[6]:

# Agrupar por Fecha
sql = '''
SELECT
    DATE(payment_date) paid_date,
    SUM(amount) sum
FROM
    payment
GROUP BY
    DATE(payment_date);
'''

pd.read_sql_query(sql, cnx)

[6]:

	paid_date	sum
0	2007-02-14	116.73
1	2007-02-19	1290.90
2	2007-02-20	1219.09
3	2007-03-19	2617.69
4	2007-04-26	347.21
5	2007-04-08	2227.84
6	2007-02-15	1188.92
7	2007-04-28	2622.73
8	2007-03-17	2442.16
9	2007-03-20	2669.89
10	2007-03-23	2342.43
11	2007-03-21	2868.27
12	2007-04-29	2717.60
13	2007-04-10	1973.18
14	2007-05-14	514.18
15	2007-04-12	1930.48
16	2007-03-02	2550.05
17	2007-04-27	2673.57
18	2007-03-16	299.28
19	2007-02-18	1275.98
20	2007-04-07	1984.28
21	2007-02-21	917.87
22	2007-02-16	1154.18
23	2007-03-18	2701.76
24	2007-04-06	2077.14
25	2007-04-30	5723.89
26	2007-03-01	2808.24
27	2007-04-11	1940.32
28	2007-04-09	2067.86
29	2007-04-05	273.36
30	2007-02-17	1188.17
31	2007-03-22	2586.79

Cláusulas HAVING¶

La cláusula HAVING especifica una condición de búsqueda para un grupo o un agregado. Se usa a menudo con la cláusula GROUP BY para filtrar grupos o agregados en función de una condición específica.

[7]:

#Los Clientes que han gastado más de 200 dólares
sql = '''
SELECT
    customer_id,
    SUM (amount)
FROM
    payment
GROUP BY
    customer_id
HAVING
    SUM (amount) > 200;
'''

pd.read_sql_query(sql, cnx)

[7]:

	customer_id	sum
0	526	208.58
1	148	211.55

[8]:

#Las tiendas con más de 300 clientes
sql = '''
SELECT
    store_id,
    COUNT (customer_id)
FROM
    customer
GROUP BY
    store_id
HAVING
    COUNT (customer_id) > 300;
'''

pd.read_sql_query(sql, cnx)

[8]:

	store_id	count
0	1	326

El operador de UNION¶

El operador UNION combina conjuntos de resultados de dos o más declaraciones SELECT en un único conjunto de resultados.

[9]:

sql = '''
DROP TABLE IF EXISTS top_rated_films;
CREATE TABLE top_rated_films(
    title VARCHAR NOT NULL,
    release_year SMALLINT
);

DROP TABLE IF EXISTS most_popular_films;
CREATE TABLE most_popular_films(
    title VARCHAR NOT NULL,
    release_year SMALLINT
);

INSERT INTO
   top_rated_films(title,release_year)
VALUES
   ('The Shawshank Redemption',1994),
   ('The Godfather',1972),
   ('12 Angry Men',1957);

INSERT INTO
   most_popular_films(title,release_year)
VALUES
   ('An American Pickle',2020),
   ('The Godfather',1972),
   ('Greyhound',2020);
'''

cnx.execute(sql)

[9]:

<sqlalchemy.engine.result.ResultProxy at 0x2d360e281f0>

[10]:

sql = '''
SELECT * FROM top_rated_films;
'''

pd.read_sql_query(sql, cnx)

[10]:

	title	release_year
0	The Shawshank Redemption	1994
1	The Godfather	1972
2	12 Angry Men	1957

[11]:

sql = '''
SELECT * FROM most_popular_films;
'''

pd.read_sql_query(sql, cnx)

[11]:

	title	release_year
0	An American Pickle	2020
1	The Godfather	1972
2	Greyhound	2020

[12]:

# La unión de dos tablas sin duplicados
sql = '''
SELECT * FROM top_rated_films
UNION
SELECT * FROM most_popular_films;
'''

pd.read_sql_query(sql, cnx)

[12]:

	title	release_year
0	An American Pickle	2020
1	Greyhound	2020
2	The Shawshank Redemption	1994
3	The Godfather	1972
4	12 Angry Men	1957

[13]:

# La unión de dos tablas con duplicados
sql = '''
SELECT * FROM top_rated_films
UNION ALL
SELECT * FROM most_popular_films;
'''

pd.read_sql_query(sql, cnx)

[13]:

	title	release_year
0	The Shawshank Redemption	1994
1	The Godfather	1972
2	12 Angry Men	1957
3	An American Pickle	2020
4	The Godfather	1972
5	Greyhound	2020

[14]:

# La unión de dos tablas con duplicados
sql = '''
SELECT * FROM top_rated_films
UNION ALL
SELECT * FROM most_popular_films
ORDER BY title;
'''

pd.read_sql_query(sql, cnx)

[14]:

	title	release_year
0	12 Angry Men	1957
1	An American Pickle	2020
2	Greyhound	2020
3	The Godfather	1972
4	The Godfather	1972
5	The Shawshank Redemption	1994

Operador INTERSECT¶

Devuelve las filas que están disponibles en ambos conjuntos de resultados. Ambos conjuntos DEBEN tener la misma cantidad de columnas y tipo.

[15]:

# La unión de dos tablas con duplicados
sql = '''
SELECT *
FROM most_popular_films
INTERSECT
SELECT *
FROM top_rated_films;
'''

pd.read_sql_query(sql, cnx)

[15]:

	title	release_year
0	The Godfather	1972

Operador EXCEPT¶

Devuelve filas distintas de la primera consulta A (izquierda) que no están en la salida de la segunda consulta B (derecha).

Except corresponde al área sombreada. Fuente

Venn

[16]:

# La unión de dos tablas con duplicados
sql = '''
SELECT * FROM top_rated_films
EXCEPT
SELECT * FROM most_popular_films;
'''

pd.read_sql_query(sql, cnx)

[16]:

	title	release_year
0	The Shawshank Redemption	1994
1	12 Angry Men	1957

[17]:

# Eliminar las tablas del ejemplo
sql = '''
DROP TABLE IF EXISTS top_rated_films;
DROP TABLE IF EXISTS most_popular_films;
'''

cnx.execute(sql)

[17]:

<sqlalchemy.engine.result.ResultProxy at 0x2d35db590a0>

El operador ANY / SOME¶

Permite comparar un valor con un conjunto de valores devueltos por una subconsulta.

El operador ANY debe estar precedido por uno de los siguientes operadores de comparación =, <=,>, <,> y <>
El operador ANY devuelve verdadero si cualquier valor de la subconsulta cumple la condición; de lo contrario, devuelve falso.
Tenga en cuenta que SOME es sinónimo de ANY, puede sustituir SOME por ANY cualquier sentencia SQL.

[18]:

# Las películas cuya duración es mayor o igual que
# la duración máxima de cualquier categoría de película.

sql = '''
SELECT title
FROM film
WHERE length >= ANY(
    SELECT MAX( length )
    FROM film
    INNER JOIN film_category USING(film_id)
    GROUP BY  category_id );
'''

pd.read_sql_query(sql, cnx)

[18]:

	title
0	Alley Evolution
1	Analyze Hoosiers
2	Anonymous Human
3	Baked Cleopatra
4	Casualties Encino
...	...
64	Wild Apollo
65	Worst Banger
66	Wrong Behavior
67	Young Language
68	Youth Kick

69 rows × 1 columns

Igual ANY ( = ANY) es equivalente al operador IN

[19]:

# obtiener las películas cuya categoría es Actiono Drama.

sql = '''
SELECT
    title,
    category_id
FROM
    film
    INNER JOIN film_category
        USING(film_id)
WHERE
    category_id = ANY(
        SELECT
            category_id
        FROM
            category
        WHERE
            NAME = 'Action'
            OR NAME = 'Drama'
    );
'''

pd.read_sql_query(sql, cnx)

[19]:

	title	category_id
0	Amadeus Holy	1
1	American Circus	1
2	Antitrust Tomatoes	1
3	Apollo Teen	7
4	Ark Ridgemont	1
...	...	...
121	Werewolf Lola	1
122	West Lion	7
123	Witches Panic	7
124	Women Dorado	1
125	Worst Banger	1

126 rows × 2 columns

[20]:

# Consulta IN equivalente

sql = '''
SELECT
    title,
    category_id
FROM
    film
INNER JOIN film_category
        USING(film_id)
WHERE
    category_id IN (
        SELECT
            category_id
        FROM
            category
        WHERE
            NAME = 'Action'
            OR NAME = 'Drama'
    );
'''

pd.read_sql_query(sql, cnx)

[20]:

	title	category_id
0	Amadeus Holy	1
1	American Circus	1
2	Antitrust Tomatoes	1
3	Apollo Teen	7
4	Ark Ridgemont	1
...	...	...
121	Werewolf Lola	1
122	West Lion	7
123	Witches Panic	7
124	Women Dorado	1
125	Worst Banger	1

126 rows × 2 columns