Notes taken by Abd @ITNA Digital
Links
🔗 Link to the Video
Keywords
Jupyter Notebook, Argparse, Python to Terminal Interaction, pgAdmin, DROP TABLE, http.server, ipconfig, localhost vs IPv4 address
You can use Jupyter to convert an existing notebook to a script with the following command:
jupyter nbconvert --to=script upload-data.ipynb
Here upload-data.ipynb was the name of the notebook we created, so the generated script is named upload-data.py. After the script has been created, edit it and remove the unnecessary parts. We have renamed the script to ingest_data.py. (It actually takes less time to copy-paste the cells from the notebook into a script than to convert it!)
ingest_data.py
from time import time

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', iterator=True, chunksize=100000)

df = next(df_iter)

df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

# create the table with just the column names
df.head(n=0).to_sql(name="yellow_taxi_data", con=engine, if_exists="replace")

# insert the first batch of rows
df.to_sql(name="yellow_taxi_data", con=engine, if_exists="append")

# insert the remaining chunks, timing each one
while True:
    try:
        t_start = time()

        df = next(df_iter)

        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

        df.to_sql(name="yellow_taxi_data", con=engine, if_exists="append")

        t_end = time()
        print('Inserted another chunk... took %.3f second(s)' % (t_end - t_start))
    except StopIteration:
        # next() raises StopIteration once the CSV is exhausted
        print('Finished ingesting all chunks')
        break
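This version takes no command line arguments, so (assuming Postgres is reachable at localhost:5432 with the credentials in the connection string, and the CSV sits in the working directory) it can be run directly:

python ingest_data.py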
argparse
We are using argparse, a standard-library module, to parse command line arguments instead of reading them manually from sys.argv.
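As a quick illustration, here is a minimal, self-contained sketch of the pattern (the script name demo.py and the --name flag are made up for this example):

import argparse

parser = argparse.ArgumentParser(description="argparse demo")
parser.add_argument('--name', help="who to greet")

args = parser.parse_args()  # parses sys.argv for us; --help comes for free
print(f"Hello, {args.name}!")

Running python demo.py --name=world prints "Hello, world!". The full ingestion script follows the same pattern: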
from time import time
import argparse
import os

import pandas as pd
from sqlalchemy import create_engine


def main(params):
    user = params.user
    password = params.password
    host = params.host
    port = params.port
    db = params.db
    table_name = params.table_name
    url = params.url
    csv_name = 'output.csv'

    # download the csv (os.system runs a shell command from Python)
    os.system(f"wget {url} -O {csv_name}")

    engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}')

    df_iter = pd.read_csv(csv_name, iterator=True, chunksize=100000)

    df = next(df_iter)

    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

    # create the table with just the column names
    df.head(n=0).to_sql(name=table_name, con=engine, if_exists="replace")

    # insert the first batch of rows
    df.to_sql(name=table_name, con=engine, if_exists="append")

    # insert the remaining chunks, timing each one
    while True:
        try:
            t_start = time()

            df = next(df_iter)

            df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
            df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

            df.to_sql(name=table_name, con=engine, if_exists="append")

            t_end = time()
            print('Inserted another chunk... took %.3f second(s)' % (t_end - t_start))
        except StopIteration:
            print('Finished ingesting all chunks')
            break


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Ingest CSV data to Postgres")

    # user, password, host, port, database name, table name, url of the csv
    parser.add_argument('--user', help="user name for postgres")
    parser.add_argument('--password', help="password for postgres")
    parser.add_argument('--host', help="host for postgres")
    parser.add_argument('--port', help="port for postgres")
    parser.add_argument('--db', help="database name for postgres")
    parser.add_argument('--table_name', help="name of the table where we will write the results to")
    parser.add_argument('--url', help="url of the CSV")

    args = parser.parse_args()

    main(args)
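To run the script, pass connection details matching the connection string used above (root/root on localhost:5432, database ny_taxi, table yellow_taxi_data); the URL below is a placeholder to replace with the actual link to the CSV:

URL="https://example.com/yellow_tripdata_2021-01.csv"  # placeholder, substitute the real CSV link

python ingest_data.py \
    --user=root \
    --password=root \
    --host=localhost \
    --port=5432 \
    --db=ny_taxi \
    --table_name=yellow_taxi_data \
    --url="${URL}"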
The os.system function allows you to run terminal commands from inside a Python script. wget's -O flag writes the downloaded file to output.csv.
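Note that os.system hands the whole string to the shell, so a URL containing special characters could break the command. A more defensive alternative (my suggestion, not what the course uses) is subprocess.run with an argument list:

import subprocess

url = "https://example.com/data.csv"   # placeholder URL for illustration
csv_name = "output.csv"

# arguments are passed as a list, so no shell quoting is involved;
# check=True raises CalledProcessError if wget exits with a non-zero status
subprocess.run(["wget", url, "-O", csv_name], check=True)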