Insérer en bloc un DataFrame Pandas à l'aide de SQLAlchemy

Question

J'ai quelques DataFrames pandas assez volumineux et j'aimerais utiliser les nouveaux mappages SQL en bloc pour les charger sur un serveur Microsoft SQL via SQLAlchemy. La méthode pandas.to_sql, bien que pratique, est lente.

J'ai du mal à écrire le code ...

J'aimerais pouvoir transmettre à cette fonction un DataFrame pandas que j'appelle table, un nom de schéma que j'appelle schema et un nom de table que j'appelle name. Idéalement, la fonction devrait : 1.) supprimer la table si elle existe déjà, 2.) créer une nouvelle table, 3.) créer un mappeur et 4.) insérer en bloc à l'aide du mappeur et des données pandas. Je suis coincé sur la partie 3.

Voici mon code (certes approximatif). Je ne parviens pas à faire en sorte que la fonction de mappage fonctionne avec mes clés primaires. Je n'ai pas vraiment besoin de clés primaires, mais la fonction de mappeur l'exige.

Merci pour les idées.

from sqlalchemy import create_engine, Table, Column, MetaData
from sqlalchemy.orm import mapper, create_session
from sqlalchemy.ext.declarative import declarative_base
from pandas.io.sql import SQLTable, SQLDatabase


def bulk_upload(table, schema, name):
    """Drop, recreate and bulk-load a SQL Server table from a DataFrame.

    Args:
        table: pandas DataFrame holding the rows to upload.
        schema: name of the target database schema.
        name: name of the target table.

    NOTE(review): this is the asker's work-in-progress code. Only its two
    syntax errors were repaired (the missing comma in the first import and
    ``return`` used as a class body). The mapping step (3) is still the
    open problem described in the question: the mapper insists on a primary
    key that the generated table does not have.
    """
    e = create_engine('mssql+pyodbc://MYDB')
    s = create_session(bind=e)
    m = MetaData(bind=e, reflect=True, schema=schema)
    Base = declarative_base(bind=e, metadata=m)

    # 1.) Drop the table if it already exists.
    t = Table(name, m)
    m.remove(t)
    t.drop(checkfirst=True)

    # 2.) Let pandas build the table definition from the DataFrame, then
    #     create it in the database.
    sqld = SQLDatabase(e, schema=schema, meta=m)
    sqlt = SQLTable(name, sqld, table).table
    sqlt.metadata = m
    m.create_all(bind=e, tables=[sqlt])

    # 3.) Map a class onto the new table (unresolved, see NOTE above).
    class MyClass(Base):
        pass

    mapper(MyClass, sqlt)

    # 4.) Bulk insert the DataFrame rows, one dict per record.
    s.bulk_insert_mappings(MyClass, table.to_dict(orient='records'))
    return

ansonw · Answer

J'ai rencontré un problème similaire avec pd.to_sql, qui prenait des heures pour charger les données. Le code ci-dessous a inséré les mêmes données en quelques secondes.

from sqlalchemy import create_engine
import psycopg2 as pg

# load python script that batch loads pandas df to sql
try:
    import cStringIO  # Python 2
except ImportError:
    # Python 3 has no cStringIO module; io.StringIO offers the same
    # in-memory text buffer, so bind it under the old name.
    import io as cStringIO

address = 'postgresql://<username>:<pswd>@<Host>:<port>/<database>'
engine = create_engine(address)
connection = engine.raw_connection()
cursor = connection.cursor()

try:
    # df is the dataframe containing an index and the columns "Event" and "Day"
    # create Index column to use as primary key
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'Index'}, inplace=True)

    # create the table but first drop if it already exists
    # (no comma after the last column: a trailing comma is a SQL syntax error)
    command = '''DROP TABLE IF EXISTS localytics_app2;
    CREATE TABLE localytics_app2
    (
    "Index" serial primary key,
    "Event" text,
    "Day" timestamp without time zone
    );'''
    cursor.execute(command)
    connection.commit()

    # stream the data using 'to_csv' and StringIO(); then use sql's 'copy_from' function
    output = cStringIO.StringIO()
    # ignore the index (it was already turned into the "Index" column above)
    df.to_csv(output, sep='\t', header=False, index=False)
    # jump to start of stream
    output.seek(0)

    # empty fields in the stream are loaded as NULL
    cursor.copy_from(output, 'localytics_app2', null="")
    connection.commit()
finally:
    # release the cursor and the raw connection even if a step above fails
    cursor.close()
    connection.close()

AkaGonjo · Answer

On aurait peut-être déjà répondu à cette question, mais j'ai trouvé la solution en regroupant différentes réponses sur ce site et en m'alignant sur la documentation de SQLAlchemy.

La table doit déjà exister dans db1, avec un index configuré en auto_increment.
La classe Current doit être alignée sur le DataFrame importé depuis le fichier CSV et sur la table de la base de données db1.

J'espère que cela aidera quiconque viendra ici et voudra combiner rapidement pandas et SQLAlchemy.

try:
    from urllib import quote_plus as urlquote  # Python 2
except ImportError:
    from urllib.parse import quote_plus as urlquote  # Python 3

import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Numeric
from sqlalchemy.orm import sessionmaker
import pandas as pd

# Set up of the engine to connect to the database
# the urlquote is used for passing the password which might contain special characters such as "/"
engine = create_engine('mysql://root:%s@localhost/db1' % urlquote('weirdPassword*withsp€cialcharacters'), echo=False)
conn = engine.connect()
Base = declarative_base()


# Declaration of the class in order to write into the database.
# This structure is standard and should align with SQLAlchemy's doc.
class Current(Base):
    """Declarative model whose columns mirror both the CSV file and the table."""

    # Declarative classes name their table through ``__tablename__``.
    __tablename__ = 'tableName'

    id = Column(Integer, primary_key=True)
    Date = Column(String(500))
    Type = Column(String(500))
    Value = Column(Numeric())

    def __repr__(self):
        return "(id='%s', Date='%s', Type='%s', Value='%s')" % (self.id, self.Date, self.Type, self.Value)


# Set up of the table in db and the file to import
fileToRead = 'file.csv'
tableToWriteTo = 'tableName'

# Panda to create a lovely dataframe
df_to_be_written = pd.read_csv(fileToRead)

# The orient='records' is the key of this, it allows to align with the
# format mentioned in the doc to insert in bulks (one dict per row).
listToWrite = df_to_be_written.to_dict(orient='records')

metadata = sqlalchemy.schema.MetaData(bind=engine, reflect=True)
table = sqlalchemy.Table(tableToWriteTo, metadata, autoload=True)

# Open the session
Session = sessionmaker(bind=engine)
session = Session()

try:
    # Insert the dataframe into the database in one bulk
    conn.execute(table.insert(), listToWrite)

    # Commit the changes
    session.commit()
finally:
    # Close the session and release the connection, even on failure
    session.close()
    conn.close()

Fabien Vauchelles · Answer

Basé sur les réponses de @ansonw:

def to_sql(engine, df, table, if_exists='fail', sep='\t', encoding='utf8'):
    """Create ``table`` from ``df`` and load its rows with a bulk COPY.

    Args:
        engine: SQLAlchemy engine; its raw DBAPI cursor must provide
            ``copy_from`` (psycopg2 in the answer this is based on).
        df: pandas DataFrame to upload. Its index is written as well, both
            in the table definition and in the streamed data.
        table: name of the destination table.
        if_exists: forwarded to ``DataFrame.to_sql`` when creating the table.
        sep: field separator used for the intermediate CSV stream.
        encoding: encoding forwarded to ``DataFrame.to_csv``.
    """
    # Local import so the function works on Python 2 and Python 3 alike.
    try:
        from cStringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3

    # Create Table: an empty slice carries the schema but no rows.
    df[:0].to_sql(table, engine, if_exists=if_exists)

    # Prepare data
    output = StringIO()
    df.to_csv(output, sep=sep, header=False, encoding=encoding)
    output.seek(0)

    # Insert data
    connection = engine.raw_connection()
    try:
        cursor = connection.cursor()
        try:
            # Empty fields in the stream are loaded as NULL.
            cursor.copy_from(output, table, sep=sep, null='')
            connection.commit()
        finally:
            cursor.close()
    finally:
        # Hand the raw connection back instead of leaking it.
        connection.close()

J'insère 200 000 lignes en 5 secondes au lieu de 4 minutes

mgoldwasser · Answer

Ma solution spécifique à Postgres ci-dessous crée automatiquement la table de base de données à partir de votre DataFrame pandas et effectue une insertion rapide en bloc à l'aide de la commande Postgres COPY my_table FROM ...

import io

import pandas as pd
from sqlalchemy import create_engine


def write_to_table(df, db_engine, schema, table_name, if_exists='fail'):
    """Create a Postgres table from ``df`` and fill it with ``COPY ... FROM STDIN``.

    Args:
        df: pandas DataFrame to upload (its index is not written).
        db_engine: SQLAlchemy engine whose DBAPI cursor provides
            ``copy_expert`` (psycopg2).
        schema: schema that holds the destination table.
        table_name: name of the destination table.
        if_exists: passed to pandas' ``SQLTable`` to decide what happens when
            the table already exists.
    """
    string_data_io = io.StringIO()
    df.to_csv(string_data_io, sep='|', index=False)

    pd_sql_engine = pd.io.sql.pandasSQL_builder(db_engine, schema=schema)
    table = pd.io.sql.SQLTable(table_name, pd_sql_engine, frame=df,
                               index=False, if_exists=if_exists, schema=schema)
    table.create()

    # Rewind only. The header line stays in the stream because the HEADER
    # option of COPY already skips the first line; consuming it here as well
    # would make COPY discard the first data row.
    string_data_io.seek(0)

    with db_engine.connect() as connection:
        with connection.connection.cursor() as cursor:
            copy_cmd = "COPY %s.%s FROM STDIN HEADER DELIMITER '|' CSV" % (schema, table_name)
            cursor.copy_expert(copy_cmd, string_data_io)
        connection.connection.commit()

dgorissen · Answer

S'agissant d'une charge de travail fortement liée aux E/S, vous pouvez également utiliser le module threading de Python via multiprocessing.dummy. Cela a accéléré les choses pour moi :

import math
from multiprocessing.dummy import Pool as ThreadPool

# ... (rest of the script elided in the original answer)


def insert_df(df, *args, **kwargs):
    """Write ``df`` with ``DataFrame.to_sql`` using several threads in parallel.

    The frame is cut into contiguous row ranges, one per worker thread, and
    each range is written with its own ``to_sql`` call.

    Args:
        df: pandas DataFrame to write.
        *args: positional arguments forwarded to ``DataFrame.to_sql``.
        **kwargs: keyword arguments forwarded to ``DataFrame.to_sql``.

    NOTE(review): every range issues its own ``to_sql`` call, so callers
    presumably need ``if_exists='append'`` (as in the example below).
    """
    nworkers = 4
    nrows = df.shape[0]

    if nrows == 0:
        # Nothing to split: a single call still lets to_sql create the table.
        df.to_sql(*args, **kwargs)
        return

    # Integer ceiling division keeps the bounds as ints on Python 2 and 3 and
    # yields at most ``nworkers`` non-empty ranges covering every row once.
    chunksize = int(math.ceil(nrows / float(nworkers)))
    chunks = [(start, min(start + chunksize, nrows))
              for start in range(0, nrows, chunksize)]

    def worker(chunk):
        i, j = chunk
        df.iloc[i:j, :].to_sql(*args, **kwargs)

    pool = ThreadPool(nworkers)
    try:
        pool.map(worker, chunks)
    finally:
        # Always shut the pool down, even if one of the writes raised.
        pool.close()
        pool.join()


# .... (rest of the script elided in the original answer)

insert_df(df, "foo_bar", engine, if_exists='append')

bootstrap · Answer

Ceci a fonctionné pour moi pour me connecter à une base de données Oracle en utilisant cx_Oracle et SQLAlchemy.

import time

import sqlalchemy
import cx_Oracle
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, String
from sqlalchemy.orm import sessionmaker
import pandas as pd

# credentials
username = "username"
password = "password"
connectStr = "connection:/string"
tableName = "tablename"

t0 = time.time()

# connection
dsn = cx_Oracle.makedsn('Host', 'port', service_name='servicename')

Base = declarative_base()


class LANDMANMINERAL(Base):
    """Declarative model listing the string columns of the target table."""

    # Declarative classes name their table through ``__tablename__``.
    __tablename__ = 'tablename'

    DOCUMENTNUM = Column(String(500), primary_key=True)
    DOCUMENTTYPE = Column(String(500))
    FILENUM = Column(String(500))
    LEASEPAYOR = Column(String(500))
    LEASESTATUS = Column(String(500))
    PROSPECT = Column(String(500))
    SPLIT = Column(String(500))
    SPLITSTATUS = Column(String(500))


# SQLAlchemy dialect/driver names are lowercase.
engine = create_engine('oracle+cx_oracle://%s:%s@%s' % (username, password, dsn))
conn = engine.connect()

Base.metadata.bind = engine

# Creating the session
DBSession = sessionmaker(bind=engine)
session = DBSession()

try:
    # Bulk insertion: one dict per CSV row
    data = pd.read_csv('data.csv')
    lists = data.to_dict(orient='records')

    # ``autoload=True`` reflects the column definitions from the database.
    table = sqlalchemy.Table('landmanmineral', Base.metadata, autoload=True)
    conn.execute(table.insert(), lists)

    session.commit()
finally:
    # Release the session and the connection, even on failure
    session.close()
    conn.close()

print("time taken %8.8f seconds" % (time.time() - t0))

freddy888 · Answer

pour les personnes comme moi qui tentent de mettre en œuvre les solutions susmentionnées:

Pandas 0.24.0 propose désormais to_sql avec les options chunksize et method='multi', qui permettent d'insérer en bloc ...