pandas csv 到 psql:正在输出中生成附加列(?)
pandas csv to psql: additional column is being generated in output (?)
正在创建我的 .csv 生成的 psql 数据库,其中包含一个额外的列,我不明白为什么...
in.csv:
Box,Color,Contents
1,Blue,"thing one [version 1] [dd/mm/yyyy]
thing two [version 1] [dd/mm/yyyy]
thing three [version 1] [dd/mm/yyyy]
2,Red,thing four [version 1] [dd/mm/yyyy]
3,Green,"thing five [version 1] [dd/mm/yyyy]
thing six [version 1] [dd/mm/yyyy]
convert.py(编辑)
## begin imports
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import pandas as pd
import numpy as np
import re
import sys
import csv
import time
## begin cellmaxmod
# Raise the csv module's per-field size limit as high as the platform accepts.
# csv.field_size_limit() raises OverflowError when the value does not fit the
# underlying C integer type (e.g. sys.maxsize on platforms with a 32-bit C
# long), so every attempt -- including the first one -- must happen inside the
# try block; an unguarded call up front would crash before the retry loop runs.
maxInt = sys.maxsize
decrement = True
while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Integer floor division keeps the value exact; int(maxInt / 10)
        # would round-trip through a float and lose precision.
        maxInt //= 10
        decrement = True
## begin csv2df
# Load in.csv into a DataFrame; a field consisting solely of '.' is
# additionally treated as a missing value (NaN).
df = pd.read_csv('in.csv', encoding='utf-8', engine='python', na_values=['.'])
## begin df2csv
# NOTE: to_csv() is called without index=False, so the DataFrame's row index
# is written to out.csv as an extra first column with an empty header.
df.to_csv('out.csv')
## begin csv2psql
class MyDB(object):
    """Hold connection settings and create a PostgreSQL database on demand.

    ``config()`` is not defined in this snippet; it is assumed to return a
    mapping with 'user', 'password', 'host' and 'port' keys -- TODO confirm.
    """

    def __init__(self):
        """Load the connection parameters via ``config()``."""
        self.params = config()
        # Set by create_new_db(); initialised here so the attribute always
        # exists, even before a database has been selected.
        self.engine = None

    def create_new_db(self, newdb):
        """Create an engine for database *newdb*, creating the database if absent.

        The resulting SQLAlchemy engine is stored on ``self.engine``.

        Args:
            newdb: Name of the PostgreSQL database to connect to / create.
        """
        user, host, port = self.params['user'], self.params['host'], self.params['port']
        pw = self.params['password']
        # NOTE(review): user and password are interpolated verbatim; values
        # containing characters such as '@' or ':' would presumably need
        # URL-escaping first -- confirm against the SQLAlchemy URL rules.
        url = 'postgresql://{}:{}@{}:{}/{}'
        url = url.format(user, pw, host, port, newdb)
        self.engine = create_engine(url, client_encoding='utf8')
        if not database_exists(self.engine.url):
            create_database(self.engine.url)
        # print(database_exists(engine.url))
def df2postgres(engine, df):
    """Write *df* to the ``data`` table and return the open connection.

    The table is replaced if it already exists, rows are inserted in chunks
    of 10, and the DataFrame index is written as a column (``index=True``).

    Args:
        engine: Object exposing ``connect()`` (a SQLAlchemy engine here).
        df: DataFrame (or any object exposing ``to_sql``) to persist.

    Returns:
        The connection obtained from ``engine.connect()``; it is left open,
        so the caller is responsible for closing it.
    """
    con = engine.connect()
    df.to_sql(name='data', con=con, if_exists='replace', index=True, chunksize=10)
    return con
if __name__ == '__main__':
    # Script entry point: make sure the 'converted' database exists and keep
    # a handle on its engine.
    testdb = MyDB()
    testdb.create_new_db('converted')
    engn = testdb.engine
    # Re-read the CSV written earlier. Because out.csv starts with a column
    # whose header is empty, pandas loads it as a column named "Unnamed: 0".
    df = pd.read_csv('out.csv', encoding='utf-8', engine='python', na_values=['.'])
    # NOTE(review): the boolean mask returned by pd.isnull(df) is discarded,
    # so this line has no effect as written.
    pd.isnull(df)
out.csv(注意新列):
,Box,Color,Contents
0,1,Blue,"thing one [version 1] [dd/mm/yyyy]
thing two [version 1] [dd/mm/yyyy]
thing three [version 1] [dd/mm/yyyy]
1,2,Red,thing four [version 1] [dd/mm/yyyy]
2,3,Green,"thing five [version 1] [dd/mm/yyyy]
thing six [version 1] [dd/mm/yyyy]
psql 数据库在 pgAdmin 4 中的截图:
问题:第二列 "Unnamed: 0" 是从哪里来的?我该如何阻止它生成,或者在创建数据库之前用脚本把它从 out.csv 中安全地删除?
~tnx.
My .csv-generated psql database is being created with an extra column
and I can't figure out why...
out.csv 中的那个额外列实际上是 DataFrame 的索引。要去掉它,需要在调用 to_csv() 方法时传入参数 index=False。
df.to_csv('out.csv', index=False)
这样生成的 out.csv 文件就不会再包含这一附加列。
希望对您有所帮助:)
正在创建我的 .csv 生成的 psql 数据库,其中包含一个额外的列,我不明白为什么...
in.csv:
Box,Color,Contents
1,Blue,"thing one [version 1] [dd/mm/yyyy]
thing two [version 1] [dd/mm/yyyy]
thing three [version 1] [dd/mm/yyyy]
2,Red,thing four [version 1] [dd/mm/yyyy]
3,Green,"thing five [version 1] [dd/mm/yyyy]
thing six [version 1] [dd/mm/yyyy]
convert.py(编辑)
## begin imports
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import pandas as pd
import numpy as np
import re
import sys
import csv
import time
## begin cellmaxmod
# Raise the csv module's per-field size limit as high as the platform accepts.
# csv.field_size_limit() raises OverflowError when the value does not fit the
# underlying C integer type (e.g. sys.maxsize on platforms with a 32-bit C
# long), so every attempt -- including the first one -- must happen inside the
# try block; an unguarded call up front would crash before the retry loop runs.
maxInt = sys.maxsize
decrement = True
while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Integer floor division keeps the value exact; int(maxInt / 10)
        # would round-trip through a float and lose precision.
        maxInt //= 10
        decrement = True
## begin csv2df
# Load in.csv into a DataFrame; a field consisting solely of '.' is
# additionally treated as a missing value (NaN).
df = pd.read_csv('in.csv', encoding='utf-8', engine='python', na_values=['.'])
## begin df2csv
# NOTE: to_csv() is called without index=False, so the DataFrame's row index
# is written to out.csv as an extra first column with an empty header.
df.to_csv('out.csv')
## begin csv2psql
class MyDB(object):
    """Hold connection settings and create a PostgreSQL database on demand.

    ``config()`` is not defined in this snippet; it is assumed to return a
    mapping with 'user', 'password', 'host' and 'port' keys -- TODO confirm.
    """

    def __init__(self):
        """Load the connection parameters via ``config()``."""
        self.params = config()
        # Set by create_new_db(); initialised here so the attribute always
        # exists, even before a database has been selected.
        self.engine = None

    def create_new_db(self, newdb):
        """Create an engine for database *newdb*, creating the database if absent.

        The resulting SQLAlchemy engine is stored on ``self.engine``.

        Args:
            newdb: Name of the PostgreSQL database to connect to / create.
        """
        user, host, port = self.params['user'], self.params['host'], self.params['port']
        pw = self.params['password']
        # NOTE(review): user and password are interpolated verbatim; values
        # containing characters such as '@' or ':' would presumably need
        # URL-escaping first -- confirm against the SQLAlchemy URL rules.
        url = 'postgresql://{}:{}@{}:{}/{}'
        url = url.format(user, pw, host, port, newdb)
        self.engine = create_engine(url, client_encoding='utf8')
        if not database_exists(self.engine.url):
            create_database(self.engine.url)
        # print(database_exists(engine.url))
def df2postgres(engine, df):
    """Write *df* to the ``data`` table and return the open connection.

    The table is replaced if it already exists, rows are inserted in chunks
    of 10, and the DataFrame index is written as a column (``index=True``).

    Args:
        engine: Object exposing ``connect()`` (a SQLAlchemy engine here).
        df: DataFrame (or any object exposing ``to_sql``) to persist.

    Returns:
        The connection obtained from ``engine.connect()``; it is left open,
        so the caller is responsible for closing it.
    """
    con = engine.connect()
    df.to_sql(name='data', con=con, if_exists='replace', index=True, chunksize=10)
    return con
if __name__ == '__main__':
    # Script entry point: make sure the 'converted' database exists and keep
    # a handle on its engine.
    testdb = MyDB()
    testdb.create_new_db('converted')
    engn = testdb.engine
    # Re-read the CSV written earlier. Because out.csv starts with a column
    # whose header is empty, pandas loads it as a column named "Unnamed: 0".
    df = pd.read_csv('out.csv', encoding='utf-8', engine='python', na_values=['.'])
    # NOTE(review): the boolean mask returned by pd.isnull(df) is discarded,
    # so this line has no effect as written.
    pd.isnull(df)
out.csv(注意新列):
,Box,Color,Contents
0,1,Blue,"thing one [version 1] [dd/mm/yyyy]
thing two [version 1] [dd/mm/yyyy]
thing three [version 1] [dd/mm/yyyy]
1,2,Red,thing four [version 1] [dd/mm/yyyy]
2,3,Green,"thing five [version 1] [dd/mm/yyyy]
thing six [version 1] [dd/mm/yyyy]
psql 数据库在 pgAdmin 4 中的截图:
问题:第二列 "Unnamed: 0" 是从哪里来的?我该如何阻止它生成,或者在创建数据库之前用脚本把它从 out.csv 中安全地删除?
~tnx.
My .csv-generated psql database is being created with an extra column and I can't figure out why...
out.csv 中的那个额外列实际上是 DataFrame 的索引。要去掉它,需要在调用 to_csv() 方法时传入参数 index=False。
df.to_csv('out.csv', index=False)
这样生成的 out.csv 文件就不会再包含这一附加列。
希望对您有所帮助:)