I have the following code, which loads a large CSV file (more than 3.5 million rows) into an SQLite database.

The program works, but it does not seem to release memory: while it runs, I can watch in `top` how its memory footprint keeps growing until it exhausts all available memory on the server, and the process is killed before all the rows have been inserted.

My understanding was that `db.commit()` (executed every time we start loading a new month from the CSV) would release the Candlestick instances created so far (which, I assume, are what makes memory grow), but it does not.

Why does this happen, and what can be changed in the code so that it loads everything without the memory leak?
from datetime import datetime
from decimal import Decimal

import csv

from pony.orm import *
from pytz import timezone

# Source CSV: 1-minute DAX candles, semicolon-delimited,
# with timestamps in New York time.
csv_filename = 'dax-1m.csv'
csv_timeframe = '1m'
csv_delimiter = ';'
csv_quotechar = '"'
csv_timezone = timezone('America/New_York')
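# Illustrative only (values invented): the layout below matches what the
# parsing code further down expects, i.e. semicolon-delimited rows of
#   date;time;open;high;low;close;volume
# with DD.MM.YYYY dates and HH:MM times, e.g.
#   01.02.2019;09:30;11160.5;11163.0;11159.5;11161.0;87.0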
# Target SQLite database; timestamps are converted to Berlin time before storage.
db_filename = 'dax.db'
db_timezone = timezone('Europe/Berlin')

db = Database()

class Candlestick(db.Entity):
    timeframe = Required(str)
    timestamp = Required(datetime)
    open = Required(Decimal, precision=12, scale=6)
    high = Required(Decimal, precision=12, scale=6)
    low = Required(Decimal, precision=12, scale=6)
    close = Required(Decimal, precision=12, scale=6)
    volume = Required(Decimal, precision=12, scale=6)

db.bind(provider='sqlite', filename=db_filename, create_db=True)
db.generate_mapping(create_tables=True)
class Loader:
    def load(self):
        rowcount = 0
        current_year = -1
        current_month = -1
        with open(csv_filename, newline='') as csvfile:
            r = csv.reader(csvfile, delimiter=csv_delimiter, quotechar=csv_quotechar)
            with db_session:
                for row in r:
                    # row[0] holds the date (DD.MM.YYYY), row[1] the time (HH:MM).
                    _year = int(row[0][-4:])
                    _month = int(row[0][3:-5])
                    _day = int(row[0][:2])
                    _hour = int(row[1][:2])
                    _minute = int(row[1][3:5])
                    csv_dt = datetime(_year, _month, _day, _hour, _minute)
                    # Localize to the CSV timezone, then convert to the storage timezone.
                    db_dt = csv_timezone.localize(csv_dt).astimezone(db_timezone)
                    Candlestick(
                        timeframe=csv_timeframe,
                        timestamp=db_dt,
                        open=row[2],
                        high=row[3],
                        low=row[4],
                        close=row[5],
                        volume=row[6]
                    )
                    rowcount += 1
                    # Commit once whenever a new month starts in the CSV.
                    if _year != current_year or _month != current_month:
                        db.commit()
                        current_year = _year
                        current_month = _month
                        print('Loading data for ' + str(current_year) + ' ' + str(current_month) + ' ...')
        print('Loaded ' + str(rowcount) + ' rows.')

ldr = Loader()
ldr.load()
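My suspicion after reading the Pony docs (unverified, so please correct me) is that it is the db_session cache, rather than uncommitted data, that grows: db.commit() flushes the inserts, but every Candlestick instance stays in the session's identity map until the db_session itself ends. If that is right, would restructuring along these lines, giving each month its own short-lived db_session, cap the memory? The helper names below are mine, just to illustrate the idea:

def insert_month(rows):
    # One fresh db_session per month: when the block exits, Pony should
    # be able to release the cache holding that month's Candlesticks.
    with db_session:
        for row in rows:
            csv_dt = datetime(int(row[0][-4:]), int(row[0][3:-5]), int(row[0][:2]),
                              int(row[1][:2]), int(row[1][3:5]))
            Candlestick(
                timeframe=csv_timeframe,
                timestamp=csv_timezone.localize(csv_dt).astimezone(db_timezone),
                open=row[2], high=row[3], low=row[4],
                close=row[5], volume=row[6])

def load_in_batches():
    batch, current_key = [], None
    with open(csv_filename, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=csv_delimiter, quotechar=csv_quotechar):
            key = (int(row[0][-4:]), int(row[0][3:-5]))  # (year, month)
            if current_key is not None and key != current_key:
                insert_month(batch)  # the previous month is complete
                batch = []
            current_key = key
            batch.append(row)
        if batch:
            insert_month(batch)      # flush the final month

Each month would then become its own transaction, which is effectively what the monthly db.commit() was doing anyway.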