Created
December 3, 2016 19:40
-
-
Save alllexx88/c4b2c1b96d2f6addbfffed4d763c05c6 to your computer and use it in GitHub Desktop.
Convert myrulib database to calibre format database. Contains database-specific parts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
SOURCE_DB="myrulib.db" | |
TARGET_DB="metadata_my.db" | |
#an empty Calibre database, with books and series | |
# create/update triggers removed | |
EMPTY_DB="metadata_empty.db" | |
LIB_ROOT="./" | |
function esc_q { | |
# double quotes escape themselves in SQL | |
sed 's/"/""/g' | |
} | |
function esc_sed { | |
# sed special characters escape | |
sed -e 's/\\/\\\\"/g' -e 's~[\.\*\^\$/]\|\[\|\]~\\&~g' | |
} | |
function my_cut { | |
local i="$1" | |
i=$((i - 1)) | |
sed -n "s/\([^¶]*¶\)\{$i\}\([^¶]*\).*/\2/p" | |
} | |
function get_date { | |
local arg="" | |
if [ -n "$1" ]; then | |
arg="--date=$1" | |
fi | |
date -u $arg "+%Y-%m-%d %H:%M:%S.%N%z" | sed 's/[0-9]\{2\}+0000$/+0000/' | |
} | |
function get_stamp { | |
get_date $(stat "${LIB_ROOT}$1" -c%y) | |
} | |
# turn comma-separated list of authors into | |
# newline separated one. Also remove some trash | |
# from authors names | |
function sanitize_authors { | |
sed \ | |
-e 's/, \?\([[:upper:]]\|d[ie][[:upper:]]\|v[ao]n \|d[ea] \|de[ls] \|te[nr]\?'` | |
`' \|tom \|де \|ван \|ди \|фон \)/\n\1/g' | sed \ | |
-e 's/, \?[ae]t al$//' -e 's/, ред$//' \ | |
-e 's/\([[:upper:]]\)\.\?[, ]*$/\1./' | |
} | |
#filter get ids from authors names (case-insensetive) | |
function auth_ids { | |
local auths=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g') | |
(sed -n -e "/^[^¶]*¶\(${auths}\)\$/Ip" | my_cut 1) <<< "${AUTHORS_FULL_NAMES}" | |
} | |
#same as auth_ids, but for series | |
function series_ids { | |
local series=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g') | |
(sed -n -e "/^[^¶]*¶\(${series}\)\$/Ip" | my_cut 1) <<< "${SERIES}" | |
} | |
#same as auth_ids, but for tags | |
function tag_ids { | |
local tags=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g') | |
(sed -n -e "/^[^¶]*¶\(${tags}\)\$/Ip" | my_cut 1) <<< "${TAGS}" | |
} | |
#get hierarchical tag from file path, e.g., | |
# "M_Mathematics/MC_Calculus/MCat_Advanced calculus/<file>.pdf" -> | |
# "Mathematics.Calculus.Advanced calculus" | |
function tag_from_filepath { | |
sed -e 's|^[^/]*_||' -e 's|/[^/]*_|/|g' -e 's|/[^/]*$||' -e 's|/|.|g' | |
} | |
#same as auth_ids, but for langs | |
function lang_ids { | |
local langs=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g') | |
(sed -n -e "/^[^¶]*¶\(${langs}\)\$/Ip" | my_cut 1) <<< "${LANGS}" | |
} | |
#feed stdin to sqlite, as well as log to "$1" | |
function feed_sql_and_log { | |
exec 3>>"$1" | |
rm -f "$1" | |
tee /dev/fd/3 | sqlite3 "${TARGET_DB}" | |
} | |
#the most painful part: process book entries and collect/feed the respective | |
# books, books_authors_link, books_series_link, books_tags_link, | |
# books_languages_link, comments SQL INSERT commands | |
function books_insert { | |
# BOOKS: | |
#<id>¶<title>¶<auths>¶<series>¶<file_path>¶<file_type>¶<md5sum>¶<lang>¶<year>¶<desc> | |
local book | |
local book_count=$(echo "${BOOKS}" | wc -l) | |
local n=$(echo -n "${book_count}" | wc -c) | |
(while IFS='' read -r book || [ -n "$book" ]; do | |
local book_id=$(echo "$book" | my_cut 1) | |
(awk "{printf \"[%07.4f]%% Inserting book %${n}d out of ${book_count}"` | |
`"\n\", (\$0*100)/${book_count}, \$0}" >&2) <<< "${book_id}" | |
local title=$(echo "$book" | my_cut 2 | esc_q) | |
local authors=$(echo "$book" | my_cut 3 | sanitize_authors) | |
local author_sort=$(echo "$authors" | \ | |
sed ':a;N;$!ba;s/\([^\n]*\)\n/\1 \& /g;s/.*/\L&/' | esc_q) | |
local series=$(echo "$book" | my_cut 4) | |
local file_path=$(echo "$book" | my_cut 5) | |
local file_type=$(echo "$book" | my_cut 6) | |
local md5sum=$(echo "$book" | my_cut 7) | |
local langs=$(echo "$book" | my_cut 8 | sed -e 's/,/\n/g') | |
local year=$(echo "$book" | my_cut 9) | |
local description=$(echo "$book" | my_cut 10 | esc_q | \ | |
sed 's/\\newline/\n/g') | |
local timestamp=$(get_stamp "${file_path}") | |
local ISBN=$(echo "${file_path}" | \ | |
sed -n 's|.*(ISBN \([0-9]*\))[^/]*$|\1|p') | |
#book hierarchical classification, extracted from file path | |
local tag=$(echo "${file_path}" | tag_from_filepath) | |
#books INSERT | |
echo -e "INSERT INTO books (id,title,sort,timestamp,pubdate,"` | |
`"author_sort,isbn,path,uuid,has_cover,"` | |
`"last_modified)\n"` | |
`"VALUES (${book_id},\"$title\",\"$title\",\"$timestamp\","` | |
`"\"$year\",\"${author_sort}\",\"$ISBN\",\"${file_path}\""` | |
`",\"$md5sum\",0,\"$timestamp\");" | |
#books_authors_link INSERT | |
auth_ids "$authors" | sed -n -e 's/.\+/'` | |
`"INSERT INTO books_authors_link (book,author)\n"` | |
`"VALUES (${book_id},&);/p" | |
#books_series_link INSERT | |
series_ids "$series" | sed -n -e 's/.\+/'` | |
`"INSERT INTO books_series_link (book,series)\n"` | |
`"VALUES (${book_id},&);/p" | |
#books_tags_link INSERT | |
tag_ids "$tag" | sed -n -e 's/.\+/'` | |
`"INSERT INTO books_tags_link (book,tag)\n"` | |
`"VALUES (${book_id},&);/p" | |
#books_languages_link INSERT | |
lang_ids "$langs" | sed -n -e 's/.\+/'` | |
`"INSERT INTO books_languages_link (book,lang_code,item_order)\n"` | |
`"VALUES (${book_id},&,0);/p" | |
#comments INSERT | |
if [ -n "${description}" ]; then | |
echo -e "INSERT INTO comments (book,text)\n"` | |
`"VALUES (${book_id},\"${description}\");" | |
fi | |
done <<< "${BOOKS}") | feed_sql_and_log "$1" | |
} | |
cp -f "${EMPTY_DB}" "${TARGET_DB}" | |
# AUTHORS_FULL_NAMES: '<id>¶<name>' lines | |
AUTHORS_FULL_NAMES=$(sqlite3 "${SOURCE_DB}" 'SELECT full_name FROM authors;' | \ | |
sanitize_authors | sort -uf | awk '{printf("%d¶%s\n", NR,$0)}') | |
# TAGS: '<id>¶<tag>' lines | |
TAGS=$(sqlite3 "${SOURCE_DB}" 'SELECT file_name FROM books;' | tag_from_filepath | \ | |
sort -uf | awk '{printf("%d¶%s\n", NR,$0)}') | |
# LANGS: '<id>¶<lang>' lines | |
LANGS=$(sqlite3 "${SOURCE_DB}" 'SELECT DISTINCT lang FROM books;' | \ | |
sed -e 's/,/\n/g' | sort -uf | awk '{printf("%d¶%s\n", NR,$0)}') | |
# BOOKS: | |
#'<id>¶<title>¶<authors>¶<sequence>¶<file_path>¶<file_type>¶<md5sum>¶<lang>¶<year>¶<description>' | |
BOOKS=$(sqlite3 -separator '<S_e^Pa_r^At_oR>' "${SOURCE_DB}" \ | |
'SELECT printf("<S_e^Pa_r^At_oR>%s",books.title),authors.full_name,'` | |
`'sequences.value,books.file_name,books.file_type,'` | |
`'books.md5sum,books.lang,books.year,books.description '` | |
`'FROM books,authors,sequences '` | |
`'WHERE books.id_author=authors.id AND books.id_sequence=sequences.id;' | \ | |
sed -e 's/<S_e^Pa_r^At_oR>/¶/g' | sed ':a;N;$!ba;s/\n\([^¶]\)/\\newline\1/g' | \ | |
awk '{printf("%d%s\n", NR,$0)}') | |
# SERIES: '<id>¶<name>' | |
SERIES=$(sqlite3 "${SOURCE_DB}" 'SELECT value FROM sequences;' | sed '/^\s*$/d' | \ | |
sort -uf | awk '{printf("%d¶%s\n", NR,$0)}') | |
#INSERT authors | |
echo "${AUTHORS_FULL_NAMES}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'` | |
`'INSERT INTO authors (id,name,sort)\n'` | |
`'VALUES (\1,\"\2\",\"\L\2\");/' | \ | |
feed_sql_and_log authors_insert_command.txt | |
#INSERT tags | |
echo "${TAGS}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'` | |
`'INSERT INTO tags (id,name)\nVALUES (\1,\"\2\");/' | \ | |
feed_sql_and_log tags_insert_command.txt | |
#INSERT languages | |
echo "${LANGS}" | sed 's/\([^¶]*\)¶\(.*\)/'` | |
`'INSERT INTO languages (id,lang_code)\n'` | |
`'VALUES (\1,\"\2\");/' | \ | |
feed_sql_and_log languages_insert_command.txt | |
#INSERT series | |
echo "${SERIES}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'` | |
`'INSERT INTO series (id,name,sort)\n'` | |
`'VALUES (\1,\"\2\",\"\2\");/' | \ | |
feed_sql_and_log series_insert_command.txt | |
#INSERT books, books_authors_link, books_series_link, | |
# books_tags_link, books_languages_link, comments | |
books_insert books_insert_command.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment