Scrapy - SQLite 中未创建 SQLalchemy 外键
Posted
技术标签:
【中文标题】Scrapy - SQLite 中未创建 SQLalchemy 外键【英文标题】:Scrapy - SQLalchemy Foreign Key not created in SQLite 【发布时间】:2020-08-22 08:18:26 【问题描述】:我尝试使用 itemLoader 运行 Scrapy 以收集所有数据并将它们放入 SQLite 3。我成功收集了我想要的所有信息,但我无法使用 @ 在我的 ThreadInfo 和 PostInfo 表中生成外键987654321@ 带外键。我确实尝试过back_ref
,但也没有用。
我的 Scrapy 完成后,所有其他信息都被插入到 SQLite 数据库中。
我的目标是让四个表相互链接,boardInfo、threadInfo、postInfo 和 authorInfo。
boardInfo 将与 threadInfo 具有一对多的关系 threadInfo 与 postInfo 是一对多的关系 authorInfo 将与 threadInfo 具有一对多的关系,并且 发布信息。我使用 DB Browser for SQLite 发现我的外键值是Null
。
我尝试查询值(threadInfo.boardInfos_id),它显示None
。我尝试解决这个问题很多天并通读文档但无法解决问题。
如何在我的 threadInfo 和 postInfo 表中生成外键?
感谢您的所有指导和 cmets。
这是我的models.py
from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData
from sqlalchemy import Integer, String, Date, DateTime, Float, Boolean, Text
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from scrapy.utils.project import get_project_settings
Base = declarative_base()
def db_connect():
'''
Performs database connection using database settings from settings.py.
Returns sqlalchemy engine instance
'''
return create_engine(get_project_settings().get('CONNECTION_STRING'))
def create_table(engine):
Base.metadata.create_all(engine)
class BoardInfo(Base):
__tablename__ = 'boardInfos'
id = Column(Integer, primary_key=True)
boardName = Column('boardName', String(100))
threadInfosLink = relationship('ThreadInfo', back_populates='boardInfosLink') # One-to-Many with threadInfo
class ThreadInfo(Base):
__tablename__ = 'threadInfos'
id = Column(Integer, primary_key=True)
threadTitle = Column('threadTitle', String())
threadLink = Column('threadLink', String())
threadAuthor = Column('threadAuthor', String())
threadPost = Column('threadPost', Text())
replyCount = Column('replyCount', Integer)
readCount = Column('readCount', Integer)
boardInfos_id = Column(Integer, ForeignKey('boardInfos.id')) # Many-to-One with boardInfo
boardInfosLink = relationship('BoardInfo', back_populates='threadInfosLink') # Many-to-One with boardInfo
postInfosLink = relationship('PostInfo', back_populates='threadInfosLink') # One-to-Many with postInfo
authorInfos_id = Column(Integer, ForeignKey('authorInfos.id')) # Many-to-One with authorInfo
authorInfosLink = relationship('AuthorInfo', back_populates='threadInfosLink') # Many-to-One with authorInfo
class PostInfo(Base):
__tablename__ = 'postInfos'
id = Column(Integer, primary_key=True)
postOrder = Column('postOrder', Integer, nullable=True)
postAuthor = Column('postAuthor', Text(), nullable=True)
postContent = Column('postContent', Text(), nullable=True)
postTimestamp = Column('postTimestamp', Text(), nullable=True)
threadInfos_id = Column(Integer, ForeignKey('threadInfos.id')) # Many-to-One with threadInfo
threadInfosLink = relationship('ThreadInfo', back_populates='postInfosLink') # Many-to-One with threadInfo
authorInfos_id = Column(Integer, ForeignKey('authorInfos.id')) # Many-to-One with authorInfo
authorInfosLink = relationship('AuthorInfo', back_populates='postInfosLink') # Many-to-One with authorInfo
class AuthorInfo(Base):
__tablename__ = 'authorInfos'
id = Column(Integer, primary_key=True)
threadAuthor = Column('threadAuthor', String())
postInfosLink = relationship('PostInfo', back_populates='authorInfosLink') # One-to-Many with postInfo
threadInfosLink = relationship('ThreadInfo', back_populates='authorInfosLink') # One-to-Many with threadInfo
这是我的 pipelines.py
from sqlalchemy import exists, event
from sqlalchemy.orm import sessionmaker
from scrapy.exceptions import DropItem
from .models import db_connect, create_table, BoardInfo, ThreadInfo, PostInfo, AuthorInfo
from sqlalchemy.engine import Engine
from sqlite3 import Connection as SQLite3Connection
import logging
@event.listens_for(Engine, "connect")
def _set_sqlite_pragma(dbapi_connection, connection_record):
if isinstance(dbapi_connection, SQLite3Connection):
cursor = dbapi_connection.cursor()
cursor.execute("PRAGMA foreign_keys=ON;")
# print("@@@@@@@ PRAGMA prog is running!! @@@@@@")
cursor.close()
class DuplicatesPipeline(object):
def __init__(self):
'''
Initializes database connection and sessionmaker.
Creates tables.
'''
engine = db_connect()
create_table(engine)
self.Session = sessionmaker(bind=engine)
logging.info('****DuplicatesPipeline: database connected****')
def process_item(self, item, spider):
session = self.Session()
exist_threadLink = session.query(exists().where(ThreadInfo.threadLink == item['threadLink'])).scalar()
exist_thread_replyCount = session.query(ThreadInfo.replyCount).filter_by(threadLink = item['threadLink']).scalar()
if exist_threadLink is True: # threadLink is in DB
if exist_thread_replyCount < item['replyCount']: # check if replyCount is more?
return item
session.close()
else:
raise DropItem('Duplicated item found and replyCount is not changed')
session.close()
else: # New threadLink to be added to BoardPipeline
return item
session.close()
class BoardPipeline(object):
def __init__(self):
'''
Initializes database connection and sessionmaker
Creates tables
'''
engine = db_connect()
create_table(engine)
self.Session = sessionmaker(bind=engine)
def process_item(self, item, spider):
'''
Save scraped info in the database
This method is called for every item pipeline component
'''
session = self.Session()
# Input info to boardInfos
boardInfo = BoardInfo()
boardInfo.boardName = item['boardName']
# Input info to threadInfos
threadInfo = ThreadInfo()
threadInfo.threadTitle = item['threadTitle']
threadInfo.threadLink = item['threadLink']
threadInfo.threadAuthor = item['threadAuthor']
threadInfo.threadPost = item['threadPost']
threadInfo.replyCount = item['replyCount']
threadInfo.readCount = item['readCount']
# Input info to postInfos
# Due to info is in list, so we have to loop and add it.
for num in range(len(item['postOrder'])):
postInfoNum = 'postInfo' + str(num)
postInfoNum = PostInfo()
postInfoNum.postOrder = item['postOrder'][num]
postInfoNum.postAuthor = item['postAuthor'][num]
postInfoNum.postContent = item['postContent'][num]
postInfoNum.postTimestamp = item['postTimestamp'][num]
session.add(postInfoNum)
# Input info to authorInfo
authorInfo = AuthorInfo()
authorInfo.threadAuthor = item['threadAuthor']
# check whether the boardName exists
exist_boardName = session.query(exists().where(BoardInfo.boardName == item['boardName'])).scalar()
if exist_boardName is False: # the current boardName does not exists
session.add(boardInfo)
# check whether the threadAuthor exists
exist_threadAuthor = session.query(exists().where(AuthorInfo.threadAuthor == item['threadAuthor'])).scalar()
if exist_threadAuthor is False: # the current threadAuthor does not exists
session.add(authorInfo)
try:
session.add(threadInfo)
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return item
【问题讨论】:
【参考方案1】:从我看到的代码来看,在我看来,您并没有在任何地方设置 ThreadInfo.authorInfosLink
或 ThreadInfo.authorInfos_id
(您的所有 FK/关系也是如此)。
对于要附加到 ThreadInfo 实例的相关对象,您需要创建它们,然后附加它们,如下所示:
# Input info to authorInfo
authorInfo = AuthorInfo()
authorInfo.threadAuthor = item['threadAuthor']
threadInfo.authorInfosLink = authorInfo
如果每个对象通过 FK 关联,您可能不想 session.add()。你会想要:
-
实例化一个
BoardInfo
对象bi
然后实例化附加你的相关ThreadInfo
对象ti
附加您的相关对象,例如bi.threadInfosLink = ti
在所有链式关系结束时,您只需使用 session.add(bi)
将 bi
添加到会话中 - 所有相关对象都将通过它们的关系添加,并且 FK 将是正确的。
【讨论】:
感谢您的解释。当我添加以下代码时,出现错误 TypeError: Incompatible collection type: ThreadInfo is not list-like。 'boardInfo.threadInfosLink = threadInfo' 当我添加'threadInfo.authorInfosLink = authorInfo'时,它可以工作,但这会在我的数据库中创建重复的'authorInfo.threadAuthor',并且我从authorInfo表中获得了FK作为主键。所以我的表中有一些重复的 authorName 并且 FK 链接了新创建的 authorName 的 PK。如何链接到相同的 BoardInfo.boardName 或 AuthorInfo.authorName? 因此,根据关系,如果它是多对一并且您决定将其置于何处,您可能需要将 .append() 附加到属性而不是分配相关对象给它。给定变量名称等,我很难阅读您的模型,所以我没有遵循您的意图,但是如果 Board 对象将包含线程集合,则为关系属性指定一个清晰的名称,例如threads = relationship('ThreadInfo', back_populates='boardInfosLink')
--那么如果你想将一个线程附加到 BoardInfo bi
它将是:bi.threads.append(ti)
或类似
我已经设法开始为您的评论工作。我需要threadInfo.authorInfosLink = authorInfo
来生成FK的链接。【参考方案2】:
根据我其他答案的 cmets 中的讨论,以下是我将如何合理化您的模型以使它们对我更有意义。
注意:
-
我已经删除了所有不必要的“信息”
我已从您的模型定义中删除了显式列名,并将依赖 SQLAlchemy 根据我的属性名称为我推断这些列名的能力
在“Post”对象中,我没有将属性命名为 PostContent,这暗示内容与 Post 相关,因为这是我们访问它的方式 - 而是简单地调用属性“Post”
我已删除所有“链接”术语 - 在我认为您希望引用相关对象集合的地方,我提供了该对象的复数属性作为关系。
我在 Post 模型中留下了一条线供您删除。如您所见,您不需要两次“作者”——一次作为相关对象,一次在 Post 上,这违背了 FK 的目的。
通过这些更改,当您尝试在其他代码中使用这些模型时,您需要在哪里使用 .append() 以及在哪里简单地分配相关对象变得很明显。对于给定的 Board 对象,您知道“线程”是仅基于属性名称的集合,因此您将执行类似 b.threads.append(thread)
from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData
from sqlalchemy import Integer, String, Date, DateTime, Float, Boolean, Text
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
class Board(Base):
__tablename__ = 'board'
id = Column(Integer, primary_key=True)
name = Column(String(100))
threads = relationship(back_populates='board')
class Thread(Base):
__tablename__ = 'thread'
id = Column(Integer, primary_key=True)
title = Column(String())
link = Column(String())
author = Column(String())
post = Column(Text())
reply_count = Column(Integer)
read_count = Column(Integer)
board_id = Column(Integer, ForeignKey('Board.id'))
board = relationship('Board', back_populates='threads')
posts = relationship('Post', back_populates='threads')
author_id = Column(Integer, ForeignKey('Author.id'))
author = relationship('Author', back_populates='threads')
class Post(Base):
__tablename__ = 'post'
id = Column(Integer, primary_key=True)
order = Column(Integer, nullable=True)
author = Column(Text(), nullable=True) # remove this line and instead use the relationship below
content = Column(Text(), nullable=True)
timestamp = Column(Text(), nullable=True)
thread_id = Column(Integer, ForeignKey('Thread.id'))
thread = relationship('Thread', back_populates='posts')
author_id = Column(Integer, ForeignKey('Author.id'))
author = relationship('Author', back_populates='posts')
class AuthorInfo(Base):
__tablename__ = 'author'
id = Column(Integer, primary_key=True)
name = Column(String())
posts = relationship('Post', back_populates='author')
threads = relationship('Thread', back_populates='author')
【讨论】:
以上是关于Scrapy - SQLite 中未创建 SQLalchemy 外键的主要内容,如果未能解决你的问题,请参考以下文章