Alapvető műveletek elvégzése
SQL nyelv |
PANDAS (python csomag) |
megjelenítési kiválasztás |
|
SELECT col1 AS col2, col3 AS col4 |
df.columns = ['col2', 'col4'] |
SELECT DISTINCT col1 |
np.unique(branches_new['bank_name']) |
CAST(col1 AS DATE) AS col2 |
pd.to_datetime(df['col1']) |
CAST(col1 AS DECIMAL(15,4)) |
df['col1'] = df['col1'].round(4) |
SELECT TOP 10 ... |
.head(10) |
Agregáció, összegzés |
|
SELECT col1, col2,COUNT(*) … group
by .. Order by .. |
df.groupby(['col1','col2']).agg({'col3':np.size}) |
SELECT day, AVG(tip), COUNT(*) |
tips.groupby('day').agg({'tip': np.mean, 'day': np.size}) |
SELECT smoker, day, COUNT(*), AVG(tip) |
tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) |
SELECT col1, col2,ROW_NUMBER() OVER (PARTITION BY col1, col2 ORDER BY
col3 ASC) AS rn |
df = df.assign(rn=df.sort_values(['col3'],
ascending=True).groupby(['col1','col2']).cumcount() + 1) |
SELECT * FROM ( |
tips.assign(rn=tips.sort_values(['total_bill'], ascending=False) |
SELECT |
.groupby(['day']) |
t.*, |
.cumcount() + 1) |
ROW_NUMBER() |
.query('rn < 3') |
OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn |
.sort_values(['day','rn']) |
FROM tips t |
) |
) |
|
WHERE rn < 3 |
|
ORDER BY day, rn; |
|
JOINING |
|
|
|
SELECT A |
A.merge(B, left_on=['col1','col3'], right_on=['col2','col4'], how='left') |
LEFT JOIN B |
|
ON A.col1 = B.col2 |
|
AND A.col3 = B.col4 |
|
|
|
UNION ALL |
pd.concat([df1, df2]) |
|
|
UNION |
pd.concat([df1, df2]).drop_duplicates() |
Szűrés |
|
|
|
WHERE col1 > 15 AND col2 = ‘ABC’ |
df[ (df['col1'] > 15) & (df['col2']=='ABC') ] |
|
|
WHERE col1 = 1 OR col2 IS NULL |
df[ (df[col1]==1) | (df['col2'].isnull()) ] |
|
|
WHERE col1 LIKE ‘%value%’ |
df['col1'].str.contains('value') |
|
|
|
.match(‘value’) |
|
|
|
|
|
|
|
|
|
|
|
|
WHERE col1 IN (‘A’, ‘B’) |
|
Rendezés |
|
|
|
ORDER BY col1, col2 DESC |
.sort_values(['col1', 'col2'], ascending=[True, False]) |
|
|
hasznos kiterjesztések |
|
|
|
IS NULL |
.isnull() |
|
|
|
|
|
|
IS NOT NULL |
.notnull() |
|
|
BETWEEN ‘2009-11-01’ AND ’2009-11-30’ |
|
|
|
DATEDIFF() |
datetime64[ns] - datetime64[ns] |
|
|
(DATE1 – DATE2) |
|
|
|
DATEADD() |
|
|
|
|
|
|
|
ADD_MONTHS() |
|
|
|
CURRENT_DATE |
import time |
|
time.strftime("%d/%m/%Y") |
getdate() |
|
|
|
SELECT INDEX(‘ABCD’,’CD’) IN |
|
|
|
POSITION |
|
|
|
DENSE_RANK |
|
|
|
CONVERT(date->string) |
|
|
|
SHOW TABLE ... |
|
|
|
|
|
Adat módosítás |
|
UPDATE tips |
|
SET tip = tip*2 |
tips.loc[tips['tip'] < 2, 'tip'] *= 2 |
WHERE tip < 2; |
|
|
tips = tips.loc[tips['tip'] <= 9] |
DELETE FROM tips |
|
WHERE tip > 9; |
|
SELECT col1 || ', ' || col2 |
df[[col1', 'col2']].apply(lambda x: ', '.join(x), axis=1) |
--- lekérdezés / sűrés
df.shape
head(df) df.head()
df.iloc[:9]
--
df.query("a <= b")
df[df["a"] <= df["b"]]
df.loc[df["a"] <= df["b"]]
--
df.query('col1 == 1 & col2 == 1')
df[(df.col1 == 1) & (df.col2 == 1)]
df[['col1', 'col2']]
df.loc[:, 'col1':'col3']
df.drop(cols_to_drop, axis=1) but see1
df[['col1']].drop_duplicates()
df[['col1', 'col2']].drop_duplicates()
df.sample(n=10)
df.sample(frac=0.01)
df[(df["time"] == "Dinner") & (df["tip"] > 5.00)] ## szűrés and -al
df[(df["size"] >= 5) | (df["total_bill"] > 45)] ## szűrés or-al
-
df[df["col2"].isna()] ## is null
df[df["col1"].notna()] ## is not null
--- join
pd.merge(df1, df2, on="key") ## alapértelmezett inner join összes mezőre
pd.merge(df1, df2, on="key", how="left") ## left join összes mezőre // left, outer, right, inner
--- union /union all
pd.concat([df1, df2]) ## union all : mezők sorrend tipusának azonosaknek kell lennie
pd.concat([df1, df2]).drop_duplicates() ## union : mezők sorrend tipusának azonosaknek kell lennie
--- limitációk
df.head() ## első 5 sor
df.tail() ## utolsó 5 sor
df["time"].str.len() ## mezőhossz vizsgálat
df["time"].str.rstrip().str.len() ## mezőhossz vizsgálat (space mentesítéssel) 2
--- rendezés
df.sort_values(['col1', 'col2'])
df.sort_values('col1', ascending=False)
-
df.nlargest(10 + 5, columns="tip").tail(10)
--- transformáció
df.rename(columns={'col1': 'col_one'})['col_one']
df.rename(columns={'col1': 'col_one'})
df.assign(c=df['a']-df['b'])
df.assign(tip_rate=df["tip"] / df["total_bill"])
-
df["bucket"] = np.where(df["total_bill"] < 10, "low", "high") ## új oszlopba számolt értékü mező
-
df.drop("sex", axis=1) ## oszlop törlése
df.rename(columns={"total_bill": "total_bill_2"}) ## oszlop átnevezése
-
df.eval("a + b")
df["a"] + df["b"]
---- csoportosítás / összegzés
df.describe()
gdf = df.groupby('col1')
df.groupby('col1').agg({'col1': 'mean'})
df.groupby('col1').sum()
-
g = df.groupby(["by1", "by2"]) # step 1
g[["v1", "v2"]].mean() # step 2
-
grouped = df.groupby(["month", "week"]) # step 1
In [27]: grouped["x"].agg([np.mean, np.std]) # step 2
-
df.groupby("sex").size() ## count sql megfelelője
df.groupby("sex").count() ## összes nem null mezőre cépez count-ot
df.groupby("sex")["total_bill"].count() ## csak kijelölt mezőn képez countot
df.groupby("day").agg({"tip": np.mean, "day": np.size}) ## SELECT day, AVG(tip), COUNT(*)
------ speciális sql-nek megfelelő pandas funkció
-- ///Oracle's ROW_NUMBER() analytic function
SELECT * FROM (
SELECT
t.*,
ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
FROM tips t
)
WHERE rn < 3
ORDER BY day, rn;
----
(df.assign(
rn=df.sort_values(["total_bill"], ascending=False)
.groupby(["day"])
.cumcount() + 1)
.query("rn < 3")
.sort_values(["day", "rn"])
)
-
(df.assign(
rnk=df.groupby(["day"])["total_bill"].rank(
method="first", ascending=False
))
.query("rnk < 3")
.sort_values(["day", "rnk"])
)
----
-- /// Oracle's RANK() analytic function
SELECT * FROM (
SELECT
t.*,
RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
FROM tips t
WHERE tip < 2
)
WHERE rnk < 3
ORDER BY sex, rnk;
---
(df[df["tip"] < 2]
.assign(rnk_min=df.groupby(["sex"])["tip"].rank(method="min"))
.query("rnk_min < 3")
.sort_values(["sex", "rnk_min"])
)
----- Update parancs
UPDATE tips
SET tip = tip*2
WHERE tip < 2;
--
tips.loc[tips["tip"] < 2, "tip"] *= 2
------ törlés oarancs
DELETE FROM tips
WHERE tip > 9;
--
df = df.loc[df["tip"] <= 9]
Megjegyzések
Megjegyzés küldése