python [papers-anno2md] macOS Papers.app脚本:将论文PDF注释转换为Markdown注释并添加到常规注释部分

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python [papers-anno2md] macOS Papers.app脚本:将论文PDF注释转换为Markdown注释并添加到常规注释部分相关的知识,希望对你有一定的参考价值。

#!/usr/local/bin/python
# encoding: utf-8
"""
*Copy the general notes from selected papers in Papers.app to the clipboard*

:Author:
    David Young

:Date Created:
    September 4, 2017

Usage:
    papers-copy-selected-papers-notes

Options:
    -h, --help            show this help message
"""
################# GLOBAL IMPORTS ####################
import sys
import os
import json
from os.path import expanduser
import sqlite3
import psutil
import base64
from polyglot.markdown import translate
from subprocess import Popen, PIPE, STDOUT
from operator import itemgetter
from fundamentals import tools

# Location of the Papers 3 SQLite database; ``main`` swaps the leading "~/"
# for the current user's home directory before connecting.
# NOTE(review): the library folder name embedded in this path looks specific
# to one machine/library -- confirm it matches the local install before use.
PAPERS_DB_PATH = "~/Library/Application Support/Papers3/Library_Data/C02N63X6G3QP__D741EA9B-BEF1-44A5-8087-4481F4EABAF3/Library.papers3/Database.papersdb"


def dict_factory(cursor, row):
    """*sqlite3 row factory that returns each row as a dictionary keyed by column name*

    **Key Arguments:**
        - ``cursor`` -- the cursor that produced the row; its ``description`` supplies the column names
        - ``row`` -- the raw row values, in column order

    **Return:**
        - a dictionary mapping every column name to the value at the same position in ``row``
    """
    return dict(
        (column[0], row[position])
        for position, column in enumerate(cursor.description)
    )


def main(arguments=None):
    """
    *The main function used when ``papers-copy-selected-papers-notes`` is run as a single script from the command line*

    Looks up the general notes of the publications currently selected in
    Papers.app, prints them (UTF-8 encoded) to stdout and returns them.

    **Key Arguments:**
        - ``arguments`` -- command-line arguments passed on to the ``fundamentals`` setup tools (default ``None``)

    **Return:**
        - ``allNotes`` -- the notes of all selected publications joined by blank lines, or ``None`` if Papers.app is not running
    """

    # SETUP THE COMMAND-LINE UTIL SETTINGS
    su = tools(
        arguments=arguments,
        docString=__doc__,
        logLevel="WARNING",
        options_first=False,
        projectName=False
    )
    arguments, settings, log, dbConn = su.setup()

    # LOG THE PARSED CL ARGUMENTS. THE ORIGINAL TEMPLATE `EXEC`-ED EACH ONE
    # INTO A LOCAL VARIABLE, WHICH BROKE ON ANY VALUE CONTAINING A QUOTE AND
    # PRODUCED VARIABLES THAT WERE NEVER READ
    for arg, val in arguments.items():
        if arg[0] == "-":
            varname = arg.replace("-", "") + "Flag"
        else:
            varname = arg.replace("<", "").replace(">", "")
        log.debug('%s = %s' % (varname, val,))

    home = expanduser("~")
    papersDB = PAPERS_DB_PATH.replace("~/", home + "/")

    # DON'T CONTINUE IF PAPERS APP IS NOT OPEN
    openApp = False
    for proc in psutil.process_iter():
        try:
            if "papers" in proc.name().lower() and "Papers.app" in proc.exe():
                openApp = True
                break
        except psutil.Error:
            # PROCESS EXITED MID-SCAN OR WE ARE NOT ALLOWED TO INSPECT IT
            continue
    if not openApp:
        log.warning(
            "Papers.app is not open, open the app before running this script")
        return

    paperUuids = grab_selected_paper_uuids_with_applescript(log)
    paperUuids = ("', '").join(paperUuids)

    # CONNECT TO PAPERS DATABASE - ALWAYS RELEASE THE CONNECTION AGAIN
    conn = sqlite3.connect(papersDB)
    try:
        conn.row_factory = dict_factory
        cursor = conn.cursor()

        # GENERATE THE QUERY TO GRAB THE NOTES FROM THE DATABASE. THE VALUES
        # INTERPOLATED HERE ARE PUBLICATION UUIDS REPORTED BY PAPERS ITSELF
        query = """select notes from Publication where UUID in ('%(paperUuids)s');""" % {
            "paperUuids": paperUuids}
        cursor.execute(query)
        results = cursor.fetchall()
        cursor.close()
    finally:
        conn.close()

    # PUBLICATIONS WITHOUT ANY NOTES HAVE A NULL COLUMN - SKIP THOSE INSTEAD
    # OF CRASHING THE JOIN
    allNotes = (u"\n\n").join(
        [r["notes"] for r in results if r["notes"] is not None])

    # SINGLE-ARGUMENT PARENTHESISED FORM PRINTS THE SAME TEXT UNDER PYTHON 2
    print(allNotes.encode("utf-8"))

    return allNotes


def grab_selected_paper_uuids_with_applescript(
        log):
    """*grab a list of the UUIDs of the selected papers via applescript*

    **Key Arguments:**
        - ``log`` -- logger

    **Return:**
        - ``paperUuids`` -- list of the UUIDs of the publications selected in the front Papers library window (empty when nothing is selected or the ``osascript`` call produced no output)
    """
    log.info('starting the ``grab_selected_paper_uuids_with_applescript`` function')

    applescript = """
        tell application "Papers"
            set uuidList to {}
            set allSelected to selected publications of front library window
            repeat with i from 1 to count of allSelected
                set end of uuidList to (id of item i of allSelected)
            end repeat
            return uuidList
        end tell
    """
    cmd = "\n".join(["osascript << EOT", applescript, "EOT"])
    p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()

    # SURFACE APPLESCRIPT FAILURES INSTEAD OF SILENTLY DISCARDING STDERR
    if p.returncode != 0:
        log.warning("osascript exited with status %s: %s" % (
            p.returncode, stderr.decode("utf-8", "replace").strip()))

    # SPLIT UUIDS INTO LIST - OSASCRIPT PRINTS LIST ITEMS COMMA-SEPARATED.
    # BLANK ENTRIES ARE DROPPED SO AN EMPTY SELECTION GIVES AN EMPTY LIST
    # RATHER THAN A LIST HOLDING ONE EMPTY STRING
    paperUuids = [
        u.strip()
        for u in stdout.decode("utf-8", "replace").split(",")
        if u.strip()
    ]

    log.info('completed the ``grab_selected_paper_uuids_with_applescript`` function')
    return paperUuids


# RUN ``main`` ONLY WHEN THIS FILE IS EXECUTED AS A SCRIPT, NOT WHEN IMPORTED
if __name__ == '__main__':
    main()
#!/usr/local/bin/python
# encoding: utf-8
"""
*Convert Papers PDF Annotations to Markdown Notes and Add to the General Notes Section for the Publication*

:Author:
    David Young

:Date Created:
    August 30, 2017

Usage:
    papers-anno2md

Options:
    -h, --help            show this help message
"""
################# GLOBAL IMPORTS ####################
import sys
import os
import json
from os.path import expanduser
import sqlite3
import psutil
import base64
from polyglot.markdown import translate
from subprocess import Popen, PIPE, STDOUT
from operator import itemgetter
from fundamentals import tools

# Location of the Papers 3 SQLite database; ``main`` swaps the leading "~/"
# for the current user's home directory before connecting.
# NOTE(review): the library folder name embedded in this path looks specific
# to one machine/library -- confirm it matches the local install before use.
PAPERS_DB_PATH = "~/Library/Application Support/Papers3/Library_Data/C02N63X6G3QP__D741EA9B-BEF1-44A5-8087-4481F4EABAF3/Library.papers3/Database.papersdb"


def dict_factory(cursor, row):
    """*sqlite3 row factory that returns each row as a dictionary keyed by column name*

    **Key Arguments:**
        - ``cursor`` -- the cursor that produced the row; its ``description`` supplies the column names
        - ``row`` -- the raw row values, in column order

    **Return:**
        - a dictionary mapping every column name to the value at the same position in ``row``
    """
    return dict(
        (column[0], row[position])
        for position, column in enumerate(cursor.description)
    )


def main(arguments=None):
    """
    *The main function used when ``papers-anno2md`` is run as a single script from the command line*

    Collects every PDF annotation from Papers.app, converts them to markdown
    notes and writes notes plus a citation into the general notes of each
    publication in the Papers database.

    **Key Arguments:**
        - ``arguments`` -- command-line arguments passed on to the ``fundamentals`` setup tools (default ``None``)

    **Return:**
        - ``None``
    """

    # SETUP THE COMMAND-LINE UTIL SETTINGS
    su = tools(
        arguments=arguments,
        docString=__doc__,
        logLevel="WARNING",
        options_first=False,
        projectName=False
    )
    arguments, settings, log, dbConn = su.setup()

    # LOG THE PARSED CL ARGUMENTS. THE ORIGINAL TEMPLATE `EXEC`-ED EACH ONE
    # INTO A LOCAL VARIABLE, WHICH BROKE ON ANY VALUE CONTAINING A QUOTE AND
    # PRODUCED VARIABLES THAT WERE NEVER READ
    for arg, val in arguments.items():
        if arg[0] == "-":
            varname = arg.replace("-", "") + "Flag"
        else:
            varname = arg.replace("<", "").replace(">", "")
        log.debug('%s = %s' % (varname, val,))

    home = expanduser("~")
    papersDB = PAPERS_DB_PATH.replace("~/", home + "/")

    # DON'T CONTINUE IF PAPERS APP IS NOT OPEN
    openApp = False
    for proc in psutil.process_iter():
        try:
            if "papers" in proc.name().lower() and "Papers.app" in proc.exe():
                openApp = True
                break
        except psutil.Error:
            # PROCESS EXITED MID-SCAN OR WE ARE NOT ALLOWED TO INSPECT IT
            continue
    if not openApp:
        log.warning(
            "Papers.app is not open, open the app before running this script")
        return

    papersJson, paperUuids = grab_papers_annotations_with_applescript(log)
    noteDict = compile_markdown_notes_from_annotations(
        papersJson, paperUuids, log)

    # CONNECT TO PAPERS DATABASE
    conn = sqlite3.connect(papersDB)
    try:
        conn.row_factory = dict_factory
        cursor = conn.cursor()

        add_notes_and_citations_to_papers(cursor, noteDict, log)

        cursor.close()
        # COMMIT EXPLICITLY - RELYING ON THE SQLITE3 MODULE'S IMPLICIT COMMIT
        # BEFORE DDL STATEMENTS LOSES THE UPDATES ON NEWER PYTHON VERSIONS
        conn.commit()
    finally:
        conn.close()

    return


def grab_papers_annotations_with_applescript(
        log):
    """*grab all annotations from the papers database via applescript - there's no easy way to order the annotations correctly when taken directly from the database*

    **Key Arguments:**
        - ``log`` -- logger

    **Return:**
        - ``papersJson`` -- list of json properties for every annotation
        - ``paperUuids`` -- list of uuids the pdf/publication annotation belong to (same length as ``papersJson``)

    **Raises:**
        - ``ValueError`` -- if the ``osascript`` output does not contain the expected list separator (e.g. the AppleScript failed)
    """
    log.info('starting the ``grab_papers_annotations_with_applescript`` function')

    separator = "||||||"
    applescript = """
        tell application "Papers"
            set thisList to {}
            set sourceList to {}
            set allAnnotations to every annotation item
            repeat with i from 1 to count of allAnnotations
                set end of thisList to json string of item i of allAnnotations
                set end of sourceList to (id of publication item of (source file of item i of allAnnotations))
            end repeat
            return thisList & "||||||" & sourceList
        end tell
    """
    cmd = "\n".join(["osascript << EOT", applescript, "EOT"])
    p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()

    # STRIP THE TRAILING NEWLINE OSASCRIPT APPENDS - OTHERWISE IT STAYS
    # ATTACHED TO THE LAST UUID AND THAT ANNOTATION NEVER MATCHES ITS PAPER
    output = stdout.decode("utf-8").strip()

    # JSON PROPERTIES FOR EVERY ANNOTATION AND THE UUID OF THE PDF/PUBLICATION
    # IT BELONGS TO. SPLIT ON THE *LAST* SEPARATOR SO ANNOTATION TEXT THAT
    # HAPPENS TO CONTAIN THE SAME CHARACTERS CANNOT BREAK THE PARSING
    papersJson, found, paperUuids = output.rpartition(separator)
    if not found:
        raise ValueError(
            "could not read annotations from Papers.app via osascript: %s" % (
                stderr.decode("utf-8", "replace").strip(),))

    # DROP THE COMMA THAT JOINED THE LAST JSON ITEM TO THE SEPARATOR
    papersJson = papersJson.strip().rstrip(",")
    papersJson = json.loads("[%s]" % (papersJson,))

    # THE FIRST FRAGMENT IS THE EMPTY TEXT BETWEEN THE SEPARATOR AND THE COMMA
    # THAT FOLLOWS IT; THE REMAINING ENTRIES STAY POSITIONALLY ALIGNED WITH
    # ``papersJson``
    paperUuids = [u.strip() for u in paperUuids.split(",")[1:]]

    log.info('completed the ``grab_papers_annotations_with_applescript`` function')
    return papersJson, paperUuids


def compile_markdown_notes_from_annotations(
        papersJson,
        paperUuids,
        log):
    """*iterate through all annotations and compile markdown style notes for each publication*

    Every annotation dictionary in ``papersJson`` is updated in place with
    ``uuid``, ``annIndex`` and (for recognised annotation types) ``md`` keys.

    **Key Arguments:**
        - ``papersJson`` -- list of json properties for every annotation
        - ``paperUuids`` -- list of uuids the pdf/publication annotation belong to (same length as ``papersJson``)
        - ``log`` -- logger

    **Return:**
        - ``noteDict`` -- a dictionary of markdown notes with publication uuids as keys

    **Raises:**
        - ``ValueError`` -- if ``papersJson`` and ``paperUuids`` differ in length
    """
    log.info('starting the ``compile_markdown_notes_from_annotations`` function')

    # THE TWO LISTS ARE PAIRED BY POSITION - A LENGTH MISMATCH MEANS NOTES
    # WOULD BE ATTACHED TO THE WRONG PUBLICATIONS, SO STOP EARLY
    if len(papersJson) != len(paperUuids):
        raise ValueError(
            "got %s annotations but %s publication uuids - cannot pair them" % (
                len(papersJson), len(paperUuids)))

    md = translate(
        log=log,
        settings=False
    )

    # ASSIGN CORRECT PDF UUID & CONVERT ANNOTATIONS TO MD
    keyed = []
    for u, j in zip(paperUuids, papersJson):

        j["uuid"] = u
        top = 1000000.
        if "rectangles" in j:
            # SECOND NUMBER OF EACH RECTANGLE STRING; KEEP THE SMALLEST
            for r in j["rectangles"]:
                this = float(r.split("}, {")[0].split(", ")[1])
                if this < top:
                    top = this
        elif "top" in j:
            top = j["top"]
        page_nr = j["page_nr"]
        # INVERT SO THAT LARGER ``top`` VALUES SORT FIRST WITHIN A PAGE
        top = 100000000. - top
        j["annIndex"] = "%s-%s" % (page_nr, top)

        # SORT ON THE NUMERIC VALUES - SORTING ON THE ``annIndex`` STRING PUT
        # PAGE 10 BEFORE PAGE 2
        keyed.append(((page_nr, top), j))

        if j['type'] == 2:
            if "text" not in j or len(j['text'].strip()) == 0:
                j["md"] = ""
            elif j['color'] == 1:
                j["md"] = md.blockquote(j['text'])
            elif j['color'] == 2:
                j["md"] = md.codeblock(j['text'], "xxx")
            elif j['color'] == 3:
                j["md"] = j['text'].strip()
            elif j['color'] == 4:
                j["md"] = md.bold(j['text'])
            elif j['color'] == 5:
                j["md"] = md.header(j['text'], level=2)
            elif j['color'] == 6:
                j["md"] = md.em(j['text'])
        elif j['type'] == 0:
            if "contents" not in j:
                j["md"] = ""
            else:
                j["md"] = j['contents'].strip()
        elif j['type'] == 53:
            j["md"] = "**!!!TAKE A SCREENGRAB OF ANNOTATED IMAGE(S) IN PDF, CONVERT TO MD LINK AND PASTE INTO NOTE AT IMAGE LOCATION!!!**"

    # STABLE SORT ON (PAGE, POSITION); ``itemgetter(0)`` KEEPS THE ANNOTATION
    # DICTIONARIES THEMSELVES OUT OF THE COMPARISON
    keyed = sorted(keyed, key=itemgetter(0), reverse=False)

    # GROUP THE MARKDOWN SNIPPETS BY PUBLICATION; ANNOTATIONS OF UNRECOGNISED
    # TYPE/COLOUR HAVE NO ``md`` KEY AND CONTRIBUTE NOTHING
    noteDict = {}
    for _, j in keyed:
        notes = noteDict.setdefault(j["uuid"], [])
        if "md" in j:
            notes.append(j["md"])

    log.info('completed the ``compile_markdown_notes_from_annotations`` function')
    return noteDict


def add_notes_and_citations_to_papers(
        cursor,
        noteDict,
        log):
    """*add markdown notes and citations to the general notes section of papers (via sqlite database)*

    For every publication with a subtitle (other than "MISSING") and an
    author/year string, a citation is built and appended to any pre-generated
    markdown notes, and the result overwrites the publication's ``notes``
    column. The ``_upd_log_Publication_notes`` trigger is dropped before the
    updates and recreated afterwards. Nothing is committed here; that is left
    to the owner of the connection.

    **Key Arguments:**
        - ``cursor`` -- the database cursor; rows must come back as mutable dictionaries keyed by column name
        - ``noteDict`` -- a dictionary of markdown notes with publication uuids as keys
        - ``log`` -- logger

    **Return:**
        - ``None``
    """
    log.info('starting the ``add_notes_and_citations_to_papers`` function')

    ratingStars = {
        5: u"★★★★★",
        4: u"★★★★☆",
        3: u"★★★☆☆",
        2: u"★★☆☆☆",
        1: u"★☆☆☆☆"
    }

    # GRAB ALL UUIDs OF ALL PAPERS
    sqlQuery = """select uuid from Publication where subtitle is not null and subtitle != 'MISSING' and author_year_string is not null;"""
    cursor.execute(sqlQuery)
    paperMeta = cursor.fetchall()

    # FOR EACH PAPER, GRAB METADATA TO FORM CITATION AND
    # ADD THE PRE-GENERATED NOTES TO THE METADATA DICTS
    for p in paperMeta:

        puid = p["uuid"]
        sqlQuery = """select remote_id as url from (select * from SyncEvent where device_ID = ? and remote_id like 'http%' order by updated_at) group by device_ID;"""
        cursor.execute(sqlQuery, (puid,))
        results = cursor.fetchall()
        p["url"] = results[0]["url"] if results else None

        sqlQuery = """select title, author_string, author_year_string, type, subtype, publisher, label, rating, subtitle, summary, tag_string, kind  from Publication where uuid = ?;"""
        cursor.execute(sqlQuery, (puid,))
        results = cursor.fetchall()
        if results:
            p.update(results[0])

        p["author_year_string"] = p["author_year_string"].replace(
            "(", "").replace(")", "")
        # ``author_string`` IS NOT FILTERED FOR NULL BY THE QUERY ABOVE
        p["author_string"] = (p["author_string"] or "").title()
        p["citeKey"] = "#%(title)s%(author_string)s" % p
        p["citeKey"] = p["citeKey"].lower().replace(" ", "")
        # NOTE(review): presumably ``author_year_string`` ends in the 4-digit
        # year once the brackets are removed - confirm against real data. The
        # original sliced [-5:-1] *after* removing the brackets, which drops
        # the final digit for strings like "Smith et al. (2017)"
        p["year"] = p["author_year_string"].strip()[-4:]
        if p["rating"]:
            # VALUES OUTSIDE 1-5 ARE LEFT UNTOUCHED
            p["rating"] = ratingStars.get(p["rating"], p["rating"])
        else:
            p["rating"] = ""

        if p["url"]:
            p["link"] = "*[%(title)s](%(url)s)*" % p
        else:
            p["link"] = "*%(title)s*" % p

        # WHO/WHEN PREFIX OF THE CITATION DEPENDS ON THE KIND OF PUBLICATION
        if "Journal" in p["subtitle"] or p["subtitle"] in ["Draft Paper", "Book", "Book Chapter"]:
            attribution = "%(author_year_string)s, " % p
        elif p["subtitle"] in ["Podcast", "Video"]:
            attribution = "%(author_string)s, " % p
        else:
            attribution = ""

        p["cite"] = u"\n[Not Cited][%s]\n\n[%s]: %s **%s**: %s%s, %s.\n            " % (
            p["citeKey"], p["citeKey"], p["rating"], p["subtitle"],
            attribution, p["link"], p["publisher"])

        p["cite"] = p["cite"].replace(", None.", "").replace(
            "..", ".").replace("Journal Abstract", "Journal Article").replace(" Et Al.", " et al.").replace(" , ", " ")

        p["mdNote"] = ""
        if p["uuid"] in noteDict:
            p["mdNote"] = ("\n\n").join(noteDict[p["uuid"]]).replace(
                "\n\n\n", "\n\n").replace("\n\n\n", "\n\n").replace("\n\n\n", "\n\n") + "\n\n"
        # NO QUOTE ESCAPING NEEDED - THE NOTE IS BOUND AS A QUERY PARAMETER
        p["mdNote"] = (p["mdNote"] + p["cite"]).strip()

    # NOW UPDATE EACH PAPER WITH THE NOTES AND CITATION INTO THE GENERAL
    # NOTE SECTION IN PAPERS APP. THE CHANGE-LOG TRIGGER IS REMOVED WHILE THE
    # UPDATES RUN AND RECREATED AFTERWARDS
    sqlQueries = []
    sqlQueries.append(("DROP TRIGGER IF EXISTS _upd_log_Publication_notes;", ()))
    for p in paperMeta:
        sqlQueries.append(
            ("""UPDATE Publication SET notes = ? where UUID = ?;""", (p["mdNote"], p["uuid"])))
    # THIS STATEMENT IS NOT %-FORMATTED, SO THE STRFTIME SPECIFIERS MUST USE A
    # SINGLE PERCENT SIGN ("%%s" WOULD MAKE SQLITE RETURN THE LITERAL TEXT "%s")
    sqlQueries.append(("""CREATE TRIGGER _upd_log_Publication_notes AFTER UPDATE OF notes ON Publication
    WHEN enableTransactionLog() AND (NEW."searchresult" = 0) AND (OLD."notes" IS NULL AND NEW."notes" IS NOT NULL OR OLD."notes" IS NOT NULL AND NEW."notes" IS NULL OR OLD."notes" <> NEW."notes") BEGIN
    DELETE FROM changeLog WHERE modUUID = NEW."uuid" AND modColumn="notes" AND modType=2; -- delete old changes immediately, for cleanup
    INSERT INTO changeLog (modifiedDate, modTable, modUUID, modType, modColumn, modValue, device, dbRevision) VALUES ((strftime("%s", "now") + strftime("%f", "now") - strftime("%S", "now")), "Publication", NEW."uuid", 2, "notes", NEW."notes", device(), dbRevision());END""", ()))

    # BEST EFFORT: A FAILING STATEMENT IS LOGGED AND THE REST STILL RUN, SO
    # THE TRIGGER IS RECREATED EVEN IF AN UPDATE FAILS
    for q, params in sqlQueries:
        try:
            cursor.execute(q, params)
        except Exception as e:
            log.warning('%(e)s' % locals())

    log.info('completed the ``add_notes_and_citations_to_papers`` function')
    return None


# RUN ``main`` ONLY WHEN THIS FILE IS EXECUTED AS A SCRIPT, NOT WHEN IMPORTED
if __name__ == '__main__':
    main()

以上是关于python [papers-anno2md] macOS Papers.app脚本:将论文PDF注释转换为Markdown注释并添加到常规注释部分的主要内容,如果未能解决你的问题,请参考以下文章

ruby tp2md.rb

使用 XMPP 的推送通知和 C2MD 路线图

SSL_CTX_new:无法加载 ssl2 md5 例程

量子力学(2.5) 生肉

python基础--(hashlib,configparser,logging)模块功能

python基础--(hashlib,configparser,logging)模块功能