#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Script to facilitate the import of a Readcube Papers 3 library into Zotero

__Purpose of this script__

If you export your Readcube (Mekentosj) Papers3 library as a BibTeX file, the file paths to the PDFs are not formatted
  correctly for Zotero to import them.

The specific issues include that:
* Papers3 does not export the file paths in a way that Zotero can understand.
* Papers3 does not export the paths to supplementary files, so only the primary PDF is imported into Zotero.
* Papers3 will export the primary PDF multiple times so you'll end up with multiple copies of the same PDF in Zotero.
* Papers3 includes superfluous supplementary files that you typically don't want to import into Zotero (e.g. *.html and
  *.webarchive files).

This script will take the BibTeX file you exported from Papers3 and modify the file paths so that they can be imported into
  Zotero.

__Usage__

This script takes as input a BibTeX library exported from readcube/mekentosj Papers3 and outputs a BibTeX library for Zotero
  to import.
The script preserves your Papers citekeys, adds supplementary files from the Papers3 Library, removes duplicate links to
  PDFs, and removes extraneous *.html and *.webarchive files that are often created by importing articles into Papers
  from a web browser.

__Instructions__

* Make sure Better BibTeX is installed in Zotero beforehand if you want to preserve the Papers citekeys.

* Export your Papers3 library as a *.bib file.
  Export > BibTeX Library
  Make sure to set the "BibTex Record" option to "Complete". This causes Papers to include the path to each entry's
    primary file (usually a PDF) in the *.bib export.

* Run this script with python 3.7 or higher to generate the file, 'zotero_import.bib', in the same location as the BibTeX
  library export.

    * You can pass the script the paths to the Papers3 library and the BibTeX library export as command line arguments,
      e.g.:

      python Papers3_to_Zotero.py --papers "~/Documents/Library.papers3" --bibtex "~/Desktop/Library.bib"

    * Or you can modify the script by updating the 'papers_lib_hardcoded' and 'bibtex_lib_hardcoded' variables with the
      paths to your Papers3 library and the BibTeX library that you just exported. E.g.:

      papers_lib_hardcoded = "~/Documents/User Library/Library.papers3" ### Path to Papers3 Library
      bibtex_lib_hardcoded = "~/Desktop/full_library_export.bib" ### Path to Papers BibTeX library export

* Running the script will generate a new BibTeX file, 'zotero_import.bib', in the same location as the BibTeX library
  export.

* Import the 'zotero_import.bib' file that gets generated with Zotero.

* Be sure to check the 'Import errors found:' file if Zotero generates one (if it exists, it will be in whatever folder you
  imported the library to; sort by title to find it).

* Also check that special characters in titles and journal names were imported correctly. Sometimes '{\&}' in the
  zotero_import.bib will be imported as '<span class="nocase">&</span>'. I'm not sure why or when this happens. You can
  search for "</span>" to check.

__NOTE__

The Collections groupings are not preserved with this method. This is one way to manually get your Papers3 Collections into
  Zotero after following the above instructions:

* Export each collection as a BibTex library ("Export" set to "Selected Collection" and "BibTex Record" set to "Standard").
  This will prevent any file paths from being included in the *.bib file.

* Import that *.bib file directly to Zotero with the option to "Place imported collections and items into new collection"
  selected.

* Then merge the duplicate records. That will give you a new collection with links to the right papers from your Zotero
  library.

* In this strategy, you have to do that for each one of your Papers3 Collections. Not ideal but maybe tolerable.

__Author__
Dae Houlihan

__Source__
https://gist.github.com/daeh/abc6d46d897b58a657699fa1a408573e
"""

import argparse
import mimetypes
import re
import sys

from pathlib import Path
from warnings import warn


def _find_supplementary_files(primary_file_path):
    """Return the extra files stored next to the primary file of an entry.

    The 'Supplemental' and 'Media' folders that sit in the same directory as
    the primary file are scanned (in that order). Web snapshots (*.html and
    *.webarchive) and the primary file itself are skipped.

    Args:
        primary_file_path: ``Path`` to the primary file of a BibTeX entry.

    Returns:
        list: ``Path`` objects, grouped by folder and sorted within each folder
        so that the generated BibTeX file is reproducible.
    """
    supp_files = []
    for dir_extra in ("Supplemental", "Media"):
        supp_dir = primary_file_path.parent / dir_extra
        if not supp_dir.is_dir():
            continue
        supp_files.extend(
            candidate
            for candidate in sorted(supp_dir.iterdir())
            if candidate.is_file()
            and candidate.suffix not in (".html", ".webarchive")
            and str(candidate) != str(primary_file_path)
        )
    return supp_files


def _supplementary_entry(supp_file):
    """Format one supplementary file as a 'title:path:mimetype' attachment entry.

    Args:
        supp_file: ``Path`` to the supplementary file.

    Returns:
        str: The entry, without the ';' separator or the surrounding braces.
    """
    mime_type, content_encoding = mimetypes.guess_type(supp_file.name)
    if mime_type is None or content_encoding is not None:
        # Unknown (or compressed) file type: derive a type from the extension.
        # The leading dot of Path.suffix must not end up in the MIME type.
        extension = supp_file.suffix.lstrip(".")
        mime_type = f"application/{extension}" if extension else "application/octet-stream"
    return f"{supp_file.stem} Supp{supp_file.suffix}:{supp_file}:{mime_type}"


def _rewrite_file_line(line, papers_library, file_line_pattern):
    """Rewrite one ``file = {...}`` line of the Papers3 export for Zotero.

    Args:
        line: The original line, starting with ``file = {``.
        papers_library: ``Path`` to the Papers3 library directory.
        file_line_pattern: Compiled regex whose group 1 is the path of the
            primary file relative to the library and whose group 2 is its
            MIME type.

    Returns:
        tuple: ``(new_line, missing_file)`` where ``missing_file`` is the
        ``Path`` of the primary file if it does not exist on disk, else ``None``.

    Raises:
        AssertionError: If the line still references more than one file after
            de-duplication, or if it does not point into the Papers3 library.
    """
    # The export wraps the value in double braces; reduce them to single braces.
    templine = re.sub(r"^file = {{(.*?)}},?", r"file = {\1},", line, flags=re.M)
    # The export lists the primary file twice ("X;X"); keep a single copy.
    newline = re.sub(r"^file = {(.*?);(\1)},?", r"file = {\1},", templine, flags=re.M)

    # Raised explicitly (rather than via `assert`) so that the checks also run under `python -O`.
    if ";" in newline:
        raise AssertionError(f"Expected the entry to reference only one file but found several: \n{newline}")

    filepath_relative = file_line_pattern.search(newline)
    if filepath_relative is None:
        raise AssertionError(
            f"Unable to match regex expression:: \n{file_line_pattern.pattern} \nwith entry from BibTex:: \n{newline}"
        )

    primary_file_path = papers_library / filepath_relative.group(1)

    missing_file = None
    if not primary_file_path.is_file():
        warn(f"The linked file was not found: {primary_file_path}", UserWarning)
        missing_file = primary_file_path

    supp_files = _find_supplementary_files(primary_file_path)
    if supp_files:
        # Everything up to the end of the primary file's MIME type, i.e. the
        # line without its closing brace, comma and newline.
        entries = [newline[: filepath_relative.end(2)]]
        for supp_file in supp_files:
            print(f"adding supplementary file for {supp_file.name}")
            entries.append(_supplementary_entry(supp_file))
        newline = ";".join(entries) + "},\n"

    return newline, missing_file


def main(papers=None, bibtex=None):
    """Convert a Papers3 BibTeX export into a BibTeX file that Zotero can import.

    Every ``file = {...}`` line of the export is rewritten so that it references
    the primary file once, followed by any supplementary files found in the
    Papers3 library. All other lines are copied unchanged. The result is written
    to 'zotero_import.bib' in the directory of the BibTeX export.

    Args:
        papers: Path to the Papers3 library (a directory ending in '.papers3').
            ``None`` selects the hardcoded path below.
        bibtex: Path to the '.bib' file exported from Papers3. ``None`` selects
            the hardcoded path below.

    Returns:
        int: Always 0 (used as the exit code of the script).

    Raises:
        Exception: If the library path does not end in '.papers3' or is not a
            directory, or if the BibTeX export is not an existing '.bib' file.
        AssertionError: If a ``file`` line references more than one distinct
            file or does not point into the Papers3 library.
    """
    ################################################
    ### Update these paths or pass via command line:
    ################################################

    ### Path to Papers3 Library ###
    papers_lib_hardcoded = "~/Documents/Library.papers3"

    ### Path to the BibTeX export of the Papers3 Library ###
    bibtex_lib_hardcoded = "~/Desktop/library.bib"

    ################################################

    papers_lib = papers_lib_hardcoded if papers is None else papers
    bibtex_lib = bibtex_lib_hardcoded if bibtex is None else bibtex
    papers_library = Path(papers_lib).expanduser()
    bibtex_library = Path(bibtex_lib).expanduser()

    if not str(papers_library).endswith(".papers3"):
        raise Exception(
            f"The variable 'papers_library' should end in with '.papers3' but is rather: \n\t{str(papers_library)}"
        )
    if not papers_library.is_dir():
        raise Exception(
            f"The path you provided to the Papers3 library does not seem to exist or is not a directory: \n\t{str(papers_library)}"
        )
    if not (bibtex_library.is_file() and bibtex_library.suffix == ".bib"):
        raise Exception(
            f"The path you provided to the BibTeX Library file you exported from Papers3 does not seem to exist or is not '.bib' file: \n\t{str(bibtex_library)}"
        )

    # The library path is matched literally, so every regex metacharacter in it
    # (not just parentheses) has to be escaped.
    papers_library_string = re.escape(str(papers_library)) + r"/"
    file_line_pattern = re.compile(r"^file = {.*?:" + papers_library_string + r"(.*?\..*?):(.*?/.*?)},?")

    out, missing = [], []
    # Read with the same encoding that is used for writing the result below.
    with open(bibtex_library, "r", encoding="utf-8") as btlib:
        for line in btlib:
            if not line.startswith("file = {"):
                out.append(line)
                continue
            newline, missing_file = _rewrite_file_line(line, papers_library, file_line_pattern)
            if missing_file is not None:
                missing.append(missing_file)
            out.append(newline)

    ### New BibTeX record to import into Zotero
    modified_lib = bibtex_library.parent / "zotero_import.bib"
    with open(modified_lib, "w", encoding="utf-8") as outfile:
        outfile.writelines(out)

    if missing:
        print("\n\nList of missing files::\n")
        for mf in missing:
            print(mf)
        print(
            f"\n\nScript completed but {len(missing)} files referenced in the BibTeX library were not located. They are listed above."
        )
    else:
        print(
            f"\n\nScript appears to have completed successfully. You can now import this file into Zotero (make sure Better BibTeX is already installed): \n\t{str(modified_lib)}"
        )

    return 0


def _cli():
    """Parse the command line options and return them as keyword arguments for `main`.

    Options that are not given on the command line are omitted from the result
    (``argument_default=argparse.SUPPRESS``), so `main` falls back to its own
    defaults for them.

    Returns:
        dict: May contain the keys 'papers' and 'bibtex', each mapped to the
        string given on the command line.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is laid out by hand (sections, bullet lists,
        # example commands); the raw formatter prints it as written instead of
        # reflowing it into one paragraph. No "(default: ...)" hints are lost,
        # because every default is SUPPRESS and would never have been shown.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        argument_default=argparse.SUPPRESS,
    )
    parser.add_argument("-p", "--papers", help="Path to Papers3 Library")
    parser.add_argument("-b", "--bibtex", help="Path to the BibTeX export")
    return vars(parser.parse_args())


if __name__ == "__main__":
    # Script entry point: forward the parsed command line options to main()
    # and use its return value as the process exit status.
    cli_options = _cli()
    exit_status = main(**cli_options)
    sys.exit(exit_status)