Created
July 4, 2023 09:49
-
-
Save josifoski/9e45bfbfe73e58bc97562ad368f055c4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Script for scraping Bible texts from biblegateway.com
# Creator: Aleksandar Josifoski for Troy Lyndon [email protected]; property of RDGames http://rdgames.us
# 2016-04-01
import codecs
import datetime
import html
import os
import random
import re
import sys
import time
import urllib.parse
import urllib.request
import zipfile

from bs4 import BeautifulSoup
## INPUT ######################################################################
# First, take a look at biblegateway.com/versions or in bgtwkeys.txt
# (bgtwkeys.txt needs to be updated from time to time) and pick the keys of
# the translations to be scraped.
# Second, list in `pool` the translations to scrape: the Bible abbreviation
# prefixed with the language abbreviation, like pool = ["EN-KJV", "EN-NKJV"].
# To re-scrape a Bible, delete the generated *_statusdone file in the
# directory where that Bible is saved.
pool = [
    'SR-ERV-SR', 'SV-SVL', 'SV-SV1917', 'SV-SFB', 'SV-SFB2014', 'SW-SNT', 'TA-ERV-TA', 'TH-TNCV', 'TH-ERV-TH', 'TL-ADB1905',
    'TL-SND', 'TWI-NA-TWI', 'UK-UKR', 'UK-ERV-UK', 'UR-ERV-UR', 'USP-USP', 'VI-VIET', 'VI-BD2011', 'VI-NVB', 'VI-BPT',
    'ZH-CCB', 'ZH-ERV-ZH', 'ZH-CNVS', 'ZH-CNVT', 'ZH-CSBS', 'ZH-CSBT', 'ZH-CUVS', 'ZH-CUV', 'ZH-CUVMPS', 'ZH-CUVMPT',
]

# To scrape only one book, or only some of them, reduce bookslist.
bookslist = [
    "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth", "1 Samuel", "2 Samuel",
    "1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles", "Ezra", "Nehemiah", "Esther", "Job", "Psalms", "Proverbs",
    "Ecclesiastes", "Song of Solomon", "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel", "Hosea",
    "Joel", "Amos", "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah", "Malachi",
    "Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians", "2 Corinthians", "Galatians", "Ephesians",
    "Philippians", "Colossians", "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", "Philemon", "Hebrews",
    "James", "1 Peter", "2 Peter", "1 John", "2 John", "3 John", "Jude", "Revelation",
]

#rootdir = '/data/rdgames/testing/bg/'
#xmlrootdir = '/data/rdgames/testing/bgxml/'
rootdir = '/home/josifoski/bibles/bg/'
xmlrootdir = '/home/josifoski/bibles/bgxml/'
# Note for rootdir/xmlrootdir: the last character must be '/'. Start the path
# with '/' for an absolute path; if the leading '/' is omitted, the
# directories are created relative to the current working directory.
###############################################################################
# Canonical list of the 66 books in order. It is kept as a reserve for
# copy/paste: if only particular books need to be scraped, trim `bookslist`
# in the INPUT section and leave this list untouched.
becarefull2 = [
    "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth", "1 Samuel", "2 Samuel",
    "1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles", "Ezra", "Nehemiah", "Esther", "Job", "Psalms", "Proverbs",
    "Ecclesiastes", "Song of Solomon", "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel", "Hosea",
    "Joel", "Amos", "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah", "Malachi",
    "Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians", "2 Corinthians", "Galatians", "Ephesians",
    "Philippians", "Colossians", "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", "Philemon", "Hebrews",
    "James", "1 Peter", "2 Peter", "1 John", "2 John", "3 John", "Jude", "Revelation",
]

# Two-digit ordinals '01'..'66', one per canonical book.
prefixes = tuple('%02d' % number for number in range(1, len(becarefull2) + 1))

# Book name -> two-digit ordinal; used as the sortable prefix of output file
# names. Derived from the canonical list so the two can never drift apart.
dprefixes = dict(zip(becarefull2, prefixes))

# Book name -> number of chapters to request for that book.
chaptersdict = {
    "Genesis": 50, "Exodus": 40, "Leviticus": 27, "Numbers": 36, "Deuteronomy": 34, "Joshua": 24, "Judges": 21, "Ruth": 4,
    "1 Samuel": 31, "2 Samuel": 24, "1 Kings": 22, "2 Kings": 25, "1 Chronicles": 29, "2 Chronicles": 36, "Ezra": 10, "Nehemiah": 13, "Esther": 10,
    "Job": 42, "Psalms": 150, "Proverbs": 31, "Ecclesiastes": 12, "Song of Solomon": 8, "Isaiah": 66, "Jeremiah": 52, "Lamentations": 5,
    "Ezekiel": 48, "Daniel": 12, "Hosea": 14, "Joel": 3, "Amos": 9, "Obadiah": 1, "Jonah": 4, "Micah": 7, "Nahum": 3, "Habakkuk": 3,
    "Zephaniah": 3, "Haggai": 2, "Zechariah": 14, "Malachi": 4, "Matthew": 28, "Mark": 16, "Luke": 24, "John": 21, "Acts": 28, "Romans": 16,
    "1 Corinthians": 16, "2 Corinthians": 13, "Galatians": 6, "Ephesians": 6, "Philippians": 4, "Colossians": 4, "1 Thessalonians": 5,
    "2 Thessalonians": 3, "1 Timothy": 6, "2 Timothy": 4, "Titus": 3, "Philemon": 1, "Hebrews": 13, "James": 5, "1 Peter": 5, "2 Peter": 3,
    "1 John": 5, "2 John": 1, "3 John": 1, "Jude": 1, "Revelation": 22,
}
# Reference versification used for the integrity check: book name (written
# without spaces) -> one 'chapter:last_verse' string per chapter, in chapter
# order. The scraper compares the number of verse lines it produced for a
# chapter with the number after the colon and logs any mismatch.
Bibliaa = {
    'Genesis': ['1:31','2:25','3:24','4:26','5:32','6:22','7:24','8:22','9:29','10:32','11:32','12:20','13:18','14:24','15:21','16:16','17:27','18:33','19:38','20:18','21:34','22:24','23:20','24:67','25:34','26:35','27:46','28:22','29:35','30:43','31:55','32:32','33:20','34:31','35:29','36:43','37:36','38:30','39:23','40:23','41:57','42:38','43:34','44:34','45:28','46:34','47:31','48:22','49:33','50:26'],
    'Exodus': ['1:22','2:25','3:22','4:31','5:23','6:30','7:25','8:32','9:35','10:29','11:10','12:51','13:22','14:31','15:27','16:36','17:16','18:27','19:25','20:26','21:36','22:31','23:33','24:18','25:40','26:37','27:21','28:43','29:46','30:38','31:18','32:35','33:23','34:35','35:35','36:38','37:29','38:31','39:43','40:38'],
    'Leviticus': ['1:17','2:16','3:17','4:35','5:19','6:30','7:38','8:36','9:24','10:20','11:47','12:8','13:59','14:57','15:33','16:34','17:16','18:30','19:37','20:27','21:24','22:33','23:44','24:23','25:55','26:46','27:34'],
    'Numbers': ['1:54','2:34','3:51','4:49','5:31','6:27','7:89','8:26','9:23','10:36','11:35','12:16','13:33','14:45','15:41','16:50','17:13','18:32','19:22','20:29','21:35','22:41','23:30','24:25','25:18','26:65','27:23','28:31','29:40','30:16','31:54','32:42','33:56','34:29','35:34','36:13'],
    'Deuteronomy': ['1:46','2:37','3:29','4:49','5:33','6:25','7:26','8:20','9:29','10:22','11:32','12:32','13:18','14:29','15:23','16:22','17:20','18:22','19:21','20:20','21:23','22:30','23:25','24:22','25:19','26:19','27:26','28:68','29:29','30:20','31:30','32:52','33:29','34:12'],
    'Joshua': ['1:18','2:24','3:17','4:24','5:15','6:27','7:26','8:35','9:27','10:43','11:23','12:24','13:33','14:15','15:63','16:10','17:18','18:28','19:51','20:9','21:45','22:34','23:16','24:33'],
    'Judges': ['1:36','2:23','3:31','4:24','5:31','6:40','7:25','8:35','9:57','10:18','11:40','12:15','13:25','14:20','15:20','16:31','17:13','18:31','19:30','20:48','21:25'],
    'Ruth': ['1:22','2:23','3:18','4:22'],
    '1Samuel': ['1:28','2:36','3:21','4:22','5:12','6:21','7:17','8:22','9:27','10:27','11:15','12:25','13:23','14:52','15:35','16:23','17:58','18:30','19:24','20:42','21:15','22:23','23:29','24:22','25:44','26:25','27:12','28:25','29:11','30:31','31:13'],
    '2Samuel': ['1:27','2:32','3:39','4:12','5:25','6:23','7:29','8:18','9:13','10:19','11:27','12:31','13:39','14:33','15:37','16:23','17:29','18:33','19:43','20:26','21:22','22:51','23:39','24:25'],
    '1Kings': ['1:53','2:46','3:28','4:34','5:18','6:38','7:51','8:66','9:28','10:29','11:43','12:33','13:34','14:31','15:34','16:34','17:24','18:46','19:21','20:43','21:29','22:53'],
    '2Kings': ['1:18','2:25','3:27','4:44','5:27','6:33','7:20','8:29','9:37','10:36','11:21','12:21','13:25','14:29','15:38','16:20','17:41','18:37','19:37','20:21','21:26','22:20','23:37','24:20','25:30'],
    '1Chronicles': ['1:54','2:55','3:24','4:43','5:26','6:81','7:40','8:40','9:44','10:14','11:47','12:40','13:14','14:17','15:29','16:43','17:27','18:17','19:19','20:8','21:30','22:19','23:32','24:31','25:31','26:32','27:34','28:21','29:30'],
    '2Chronicles': ['1:17','2:18','3:17','4:22','5:14','6:42','7:22','8:18','9:31','10:19','11:23','12:16','13:22','14:15','15:19','16:14','17:19','18:34','19:11','20:37','21:20','22:12','23:21','24:27','25:28','26:23','27:9','28:27','29:36','30:27','31:21','32:33','33:25','34:33','35:27','36:23'],
    'Ezra': ['1:11','2:70','3:13','4:24','5:17','6:22','7:28','8:36','9:15','10:44'],
    'Nehemiah': ['1:11','2:20','3:32','4:23','5:19','6:19','7:73','8:18','9:38','10:39','11:36','12:47','13:31'],
    'Esther': ['1:22','2:23','3:15','4:17','5:14','6:14','7:10','8:17','9:32','10:3'],
    'Job': ['1:22','2:13','3:26','4:21','5:27','6:30','7:21','8:22','9:35','10:22','11:20','12:25','13:28','14:22','15:35','16:22','17:16','18:21','19:29','20:29','21:34','22:30','23:17','24:25','25:6','26:14','27:23','28:28','29:25','30:31','31:40','32:22','33:33','34:37','35:16','36:33','37:24','38:41','39:30','40:24','41:34','42:17'],
    'Psalms': ['1:6','2:12','3:8','4:8','5:12','6:10','7:17','8:9','9:20','10:18','11:7','12:8','13:6','14:7','15:5','16:11','17:15','18:50','19:14','20:9','21:13','22:31','23:6','24:10','25:22','26:12','27:14','28:9','29:11','30:12','31:24','32:11','33:22','34:22','35:28','36:12','37:40','38:22','39:13','40:17','41:13','42:11','43:5','44:26','45:17','46:11','47:9','48:14','49:20','50:23','51:19','52:9','53:6','54:7','55:23','56:13','57:11','58:11','59:17','60:12','61:8','62:12','63:11','64:10','65:13','66:20','67:7','68:35','69:36','70:5','71:24','72:20','73:28','74:23','75:10','76:12','77:20','78:72','79:13','80:19','81:16','82:8','83:18','84:12','85:13','86:17','87:7','88:18','89:52','90:17','91:16','92:15','93:5','94:23','95:11','96:13','97:12','98:9','99:9','100:5','101:8','102:28','103:22','104:35','105:45','106:48','107:43','108:13','109:31','110:7','111:10','112:10','113:9','114:8','115:18','116:19','117:2','118:29','119:176','120:7','121:8','122:9','123:4','124:8','125:5','126:6','127:5','128:6','129:8','130:8','131:3','132:18','133:3','134:3','135:21','136:26','137:9','138:8','139:24','140:13','141:10','142:7','143:12','144:15','145:21','146:10','147:20','148:14','149:9','150:6'],
    'Proverbs': ['1:33','2:22','3:35','4:27','5:23','6:35','7:27','8:36','9:18','10:32','11:31','12:28','13:25','14:35','15:33','16:33','17:28','18:24','19:29','20:30','21:31','22:29','23:35','24:34','25:28','26:28','27:27','28:28','29:27','30:33','31:31'],
    'Ecclesiastes': ['1:18','2:26','3:22','4:16','5:20','6:12','7:29','8:17','9:18','10:20','11:10','12:14'],
    'SongofSolomon': ['1:17','2:17','3:11','4:16','5:16','6:13','7:13','8:14'],
    'Isaiah': ['1:31','2:22','3:26','4:6','5:30','6:13','7:25','8:22','9:21','10:34','11:16','12:6','13:22','14:32','15:9','16:14','17:14','18:7','19:25','20:6','21:17','22:25','23:18','24:23','25:12','26:21','27:13','28:29','29:24','30:33','31:9','32:20','33:24','34:17','35:10','36:22','37:38','38:22','39:8','40:31','41:29','42:25','43:28','44:28','45:25','46:13','47:15','48:22','49:26','50:11','51:23','52:15','53:12','54:17','55:13','56:12','57:21','58:14','59:21','60:22','61:11','62:12','63:19','64:12','65:25','66:24'],
    'Jeremiah': ['1:19','2:37','3:25','4:31','5:31','6:30','7:34','8:22','9:26','10:25','11:23','12:17','13:27','14:22','15:21','16:21','17:27','18:23','19:15','20:18','21:14','22:30','23:40','24:10','25:38','26:24','27:22','28:17','29:32','30:24','31:40','32:44','33:26','34:22','35:19','36:32','37:21','38:28','39:18','40:16','41:18','42:22','43:13','44:30','45:5','46:28','47:7','48:47','49:39','50:46','51:64','52:34'],
    'Lamentations': ['1:22','2:22','3:66','4:22','5:22'],
    'Ezekiel': ['1:28','2:10','3:27','4:17','5:17','6:14','7:27','8:18','9:11','10:22','11:25','12:28','13:23','14:23','15:8','16:63','17:24','18:32','19:14','20:49','21:32','22:31','23:49','24:27','25:17','26:21','27:36','28:26','29:21','30:26','31:18','32:32','33:33','34:31','35:15','36:38','37:28','38:23','39:29','40:49','41:26','42:20','43:27','44:31','45:25','46:24','47:23','48:35'],
    'Daniel': ['1:21','2:49','3:30','4:37','5:31','6:28','7:28','8:27','9:27','10:21','11:45','12:13'],
    'Hosea': ['1:11','2:23','3:5','4:19','5:15','6:11','7:16','8:14','9:17','10:15','11:12','12:14','13:16','14:9'],
    'Joel': ['1:20','2:32','3:21'],
    'Amos': ['1:15','2:16','3:15','4:13','5:27','6:14','7:17','8:14','9:15'],
    'Obadiah': ['1:21'],
    'Jonah': ['1:17','2:10','3:10','4:11'],
    'Micah': ['1:16','2:13','3:12','4:13','5:15','6:16','7:20'],
    'Nahum': ['1:15','2:13','3:19'],
    'Habakkuk': ['1:17','2:20','3:19'],
    'Zephaniah': ['1:18','2:15','3:20'],
    'Haggai': ['1:15','2:23'],
    'Zechariah': ['1:21','2:13','3:10','4:14','5:11','6:15','7:14','8:23','9:17','10:12','11:17','12:14','13:9','14:21'],
    'Malachi': ['1:14','2:17','3:18','4:6'],
    'Matthew': ['1:25','2:23','3:17','4:25','5:48','6:34','7:29','8:34','9:38','10:42','11:30','12:50','13:58','14:36','15:39','16:28','17:27','18:35','19:30','20:34','21:46','22:46','23:39','24:51','25:46','26:75','27:66','28:20'],
    'Mark': ['1:45','2:28','3:35','4:41','5:43','6:56','7:37','8:38','9:50','10:52','11:33','12:44','13:37','14:72','15:47','16:20'],
    'Luke': ['1:80','2:52','3:38','4:44','5:39','6:49','7:50','8:56','9:62','10:42','11:54','12:59','13:35','14:35','15:32','16:31','17:37','18:43','19:48','20:47','21:38','22:71','23:56','24:53'],
    'John': ['1:51','2:25','3:36','4:54','5:47','6:71','7:53','8:59','9:41','10:42','11:57','12:50','13:38','14:31','15:27','16:33','17:26','18:40','19:42','20:31','21:25'],
    'Acts': ['1:26','2:47','3:26','4:37','5:42','6:15','7:60','8:40','9:43','10:48','11:30','12:25','13:52','14:28','15:41','16:40','17:34','18:28','19:41','20:38','21:40','22:30','23:35','24:27','25:27','26:32','27:44','28:31'],
    'Romans': ['1:32','2:29','3:31','4:25','5:21','6:23','7:25','8:39','9:33','10:21','11:36','12:21','13:14','14:23','15:33','16:27'],
    '1Corinthians': ['1:31','2:16','3:23','4:21','5:13','6:20','7:40','8:13','9:27','10:33','11:34','12:31','13:13','14:40','15:58','16:24'],
    '2Corinthians': ['1:24','2:17','3:18','4:18','5:21','6:18','7:16','8:24','9:15','10:18','11:33','12:21','13:14'],
    'Galatians': ['1:24','2:21','3:29','4:31','5:26','6:18'],
    'Ephesians': ['1:23','2:22','3:21','4:32','5:33','6:24'],
    'Philippians': ['1:30','2:30','3:21','4:23'],
    'Colossians': ['1:29','2:23','3:25','4:18'],
    '1Thessalonians': ['1:10','2:20','3:13','4:18','5:28'],
    '2Thessalonians': ['1:12','2:17','3:18'],
    '1Timothy': ['1:20','2:15','3:16','4:16','5:25','6:21'],
    '2Timothy': ['1:18','2:26','3:17','4:22'],
    'Titus': ['1:16','2:15','3:15'],
    'Philemon': ['1:25'],
    'Hebrews': ['1:14','2:18','3:19','4:16','5:14','6:20','7:28','8:13','9:28','10:39','11:40','12:29','13:25'],
    'James': ['1:27','2:26','3:18','4:17','5:20'],
    '1Peter': ['1:25','2:25','3:22','4:19','5:14'],
    '2Peter': ['1:21','2:22','3:18'],
    '1John': ['1:10','2:29','3:24','4:21','5:21'],
    '2John': ['1:13'],
    '3John': ['1:14'],
    'Jude': ['1:25'],
    'Revelation': ['1:20','2:29','3:22','4:11','5:14','6:17','7:17','8:13','9:21','10:11','11:19','12:17','13:18','14:20','15:8','16:21','17:18','18:24','19:21','20:15','21:27','22:21'],
}
def splitversetext(snum, vtext):
    """Spread the text of a grouped verse range evenly over its verse numbers.

    Some translations print several verses as one block (e.g. "22-24").
    The block's words are divided into equal-sized runs, one per verse, and
    the last verse receives whatever words remain.

    Args:
        snum: Verse range as scraped, "<first>-<last>". Either bound may
            carry a trailing part letter from 'abcde' (e.g. "3a-4b").
        vtext: Cleaned text of the whole grouped block.

    Side effects (module globals):
        sline: set to the generated "{chapter:verse} text" lines.
        numoflines: incremented once per generated line.
        fzgroupingsreff: receives a "<book prefix>;<chapter>;<range>" record.
        Reads ``c`` (current book), ``glava`` (current chapter) and
        ``dprefixes``.

    Raises:
        IndexError: if ``snum`` contains no '-'.
        ValueError: if a bound is not a number after removing part letters.
    """
    global numoflines
    global sline

    def _bound(token):
        # Bounds such as '22a' carry a part letter; drop it before converting.
        try:
            return int(token)
        except ValueError:
            return int(token.strip('abcde'))

    fzgroupingsreff.write(dprefixes[c] + ';' + glava + ';' + snum + os.linesep)

    words = vtext.split()
    bounds = snum.split('-')
    ileft = _bound(bounds[0])
    iright = _bound(bounds[1])
    if iright < ileft:
        # A reversed range used to divide by zero or silently drop the text;
        # keep everything on the first verse instead.
        iright = ileft

    span = iright - ileft + 1
    share = len(words) // span  # words given to every verse but the last

    lines = []
    for offset, verse in enumerate(range(ileft, iright)):
        chunk = words[offset * share:(offset + 1) * share]
        lines.append('{' + glava + ':' + str(verse) + '} ' + ' '.join(chunk) + os.linesep)
    # The final verse absorbs the remainder of the words.
    remainder = words[(span - 1) * share:]
    lines.append('{' + glava + ':' + str(iright) + '} ' + ' '.join(remainder) + os.linesep)

    sline = ''.join(lines)
    numoflines += span
# ---------------------------------------------------------------------------
# Main run: for every translation in `pool`, fetch every chapter of every book
# in `bookslist` from biblegateway.com, archive the raw passage markup in a
# zip file and write one "{chapter:verse} text" line per verse into a text
# file per book. Problems are recorded in <abbrev>_integrityinfos.txt.
# ---------------------------------------------------------------------------

_VERSENUM_ATTRS = {"class": "versenum"}


def _elapsed_hms(seconds):
    """Split a duration in seconds into (whole hours, whole minutes, seconds)."""
    hours = int(seconds / 3600)
    minutes = int((seconds - hours * 3600) / 60)
    return hours, minutes, seconds - hours * 3600 - minutes * 60


def _heading_text(tags):
    """Return the stripped texts of the given tags joined by single spaces."""
    return ' '.join(tag.text.strip() for tag in tags).strip()


def _clean_verse_text(text):
    """Normalise the spacing around punctuation and quotes of one verse."""
    text = text.strip()
    text = re.sub(' +', ' ', text, flags=re.UNICODE)
    text = re.sub(' ([?!.:;,])', r'\1', text, flags=re.UNICODE)
    text = re.sub(r'([?!.:;,])(\w)', r'\1 \2', text, flags=re.UNICODE)
    text = re.sub('([^ ])—', r'\1 —', text, flags=re.UNICODE)
    text = re.sub('—([^ ])', r'— \1', text, flags=re.UNICODE)
    # Drop bracketed one-word markers such as footnote letters.
    text = re.sub(r'\[ ?\w+ ?\]', '', text, flags=re.UNICODE)
    # Re-join numbers that the punctuation spacing above split ("1, 000").
    text = re.sub(r'( \d+,) (\d+[ ,;:.])', r'\1\2', text, flags=re.UNICODE)
    text = re.sub(r'( \d+,) (\d+)$', r'\1\2', text, flags=re.UNICODE)
    text = re.sub('([“ ‘])L ord', r'\1Lord', text, flags=re.UNICODE)
    text = text.replace('“ ', '“')
    text = text.replace(' ”', '”')
    text = text.replace('‘ ', '‘')
    text = text.replace(' ’', '’')
    text = text.replace(" 's", "'s")
    return text


now = datetime.datetime.now()
currentdate = str(now).split()[0].replace('-', '')
writeorappend = 'w'
headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17",
}
time1 = time.time()
ibiblecount = 0

for ibiblecount, bib in enumerate(pool, 1):
    bib = str(bib).strip()
    progress = '#' + str(ibiblecount) + '/' + str(len(pool))
    # 'SR-ERV-SR' -> 'SR_ERVSR': only the language separator becomes '_',
    # any further dashes are dropped.
    newabbrev = bib.replace('-', '_', 1).replace('-', '')
    # Version key as biblegateway expects it (everything after the language).
    version = '-'.join(bib.split('-')[1:])

    directoryf = rootdir + newabbrev.split('_')[0] + '/' + newabbrev + '/'
    os.makedirs(directoryf, exist_ok=True)
    if os.path.exists(directoryf + newabbrev + '_statusdone'):
        print(progress + ' ' + newabbrev + ' already previously scrapped')
        continue

    directoryfxmls = xmlrootdir + bib.split('-')[0] + '/'
    os.makedirs(directoryfxmls, exist_ok=True)
    filenamexmlzip = directoryfxmls + newabbrev + '_xml_files_' + currentdate + '.zip'

    time2 = time.time()
    errors = 0
    groupings = 0
    # Records every translation/chapter whose text failed an integrity check.
    sfileintegrityname = directoryf + newabbrev + '_integrityinfos.txt'
    szgroupingsreff = directoryf + newabbrev + '_groupingsreff.txt'

    # `fzgroupingsreff` must stay a module-level name: splitversetext() writes to it.
    with zipfile.ZipFile(filenamexmlzip, "w") as zxml, \
            open(sfileintegrityname, writeorappend) as fintegrity, \
            codecs.open(szgroupingsreff, writeorappend, 'utf-8') as fzgroupingsreff:
        fintegrity.write('source: biblegateway.com/versions' + os.linesep)

        for c in bookslist:
            c = c.strip()
            if c == '':
                continue
            time3 = time.time()
            sfilename = directoryf + dprefixes[c] + '-' + c.replace(' ', '') + '.' + newabbrev + '.txt'
            shortname = sfilename.split('/')[-1]
            print(progress + ' ' + shortname)

            with codecs.open(sfilename, writeorappend, 'utf-8') as g:
                # To scrape only particular chapters, replace the range below,
                # e.g. `for i in [2, 3, 5]:` scrapes chapters 3, 4 and 6 (i is
                # zero-based). Be careful: this script is meant for batch
                # processing of whole Bibles, so point rootdir at a testing
                # folder first to avoid overwriting full texts with partial ones.
                for i in range(chaptersdict[c]):
                    glava = str(i + 1)
                    url = "http://www.biblegateway.com/passage/?" + urllib.parse.urlencode(
                        {'search': c + ' ' + glava, 'version': version})

                    # A failed request must not abort the batch: the chapter is
                    # logged and skipped. `Exception` (not a bare except) keeps
                    # Ctrl-C and SystemExit working.
                    try:
                        req = urllib.request.Request(url, headers=headers)
                        with urllib.request.urlopen(req) as resp:
                            so = resp.read().decode('utf-8')
                        soup = BeautifulSoup(so, 'html.parser')
                    except Exception:
                        soup = None

                    if soup is None:
                        fintegrity.write('! ' + shortname + ' chapter: ' + glava + ', do not exists on site' + os.linesep)
                        errors += 1
                    else:
                        mydivs = soup.findAll("div", {"class": "passage-text"})
                        time.sleep(0.1)
                        if not mydivs:
                            fintegrity.write('! ' + shortname + ' chapter: ' + glava + ', do not exists on site' + os.linesep)
                            errors += 1
                            continue
                        sobs = html.unescape(str(mydivs[0]))

                        # Keep the passage markup as scraped, for future reference.
                        # NOTE(review): the text is unescaped a second time here,
                        # as in the original; confirm that this is intended.
                        xmlname = dprefixes[c] + '-' + '%03d' % int(i + 1) + '-' + c.replace(' ', '') + '.' + newabbrev + '.xml'
                        zxml.writestr(xmlname, html.unescape(sobs))

                        # Close the running text span before each verse number and
                        # reopen it after, then cut the markup into one piece per verse.
                        sobs = re.sub('(<sup class="versenum">)(.*?)(</sup>)', r'</span>\1\2\3<span class="text">', sobs, flags=re.UNICODE)
                        sobs = sobs.replace('<sup class="versenum">', 'SplittingForGod<sup class="versenum">')
                        lso = sobs.split('SplittingForGod')

                        # A heading before the first verse becomes an intro line.
                        if '</h3>' in lso[0] or '<h3>' in lso[0]:
                            siv1 = _heading_text(BeautifulSoup(lso[0], 'html.parser').findAll("h3"))
                            if siv1 != '':
                                g.write('{i' + glava + ':1} ' + siv1 + os.linesep)
                            lso[0] = re.sub('<h3>.*?</h3>', '', lso[0], flags=re.UNICODE)

                        numoflines = 0
                        # The first verse usually shows the chapter number instead
                        # of a verse number; give it an explicit "1".
                        if BeautifulSoup(lso[0], 'html.parser').find("sup", _VERSENUM_ATTRS) is None:
                            lso[0] = '<sup class="versenum"> 1 </sup>' + lso[0]

                        # Fix broken parts without a versenum tag: glue them onto
                        # the preceding piece.
                        ind = len(lso) - 1
                        while ind > 0:
                            if BeautifulSoup(lso[ind], 'html.parser').find("sup", _VERSENUM_ATTRS) is None:
                                lso[ind - 1] = lso[ind - 1] + lso[ind]
                                del lso[ind]
                            ind -= 1

                        # Fix overlapping groups like 22-23; 23-24 (Genesis 27,
                        # EN-MSG) so that the result is a single 22-24.
                        lvn = [
                            BeautifulSoup(item, 'html.parser').find("sup", _VERSENUM_ATTRS).text.strip()
                            for item in lso
                        ]
                        ind = 0
                        while ind < len(lso) - 1:
                            if '-' in lvn[ind]:
                                gnleft, gn1 = lvn[ind].split('-')[0], lvn[ind].split('-')[1]
                            else:
                                gnleft = gn1 = lvn[ind]
                            if '-' in lvn[ind + 1]:
                                gn2, gnright = lvn[ind + 1].split('-')[0], lvn[ind + 1].split('-')[1]
                            else:
                                gn2 = gnright = lvn[ind + 1]
                            if gn1 == gn2:
                                lso[ind + 1] = lso[ind] + ' ' + lso[ind + 1]
                                del lso[ind]
                                lvn[ind + 1] = gnleft + '-' + gnright
                                del lvn[ind]
                                isoup = BeautifulSoup(lso[ind], 'html.parser')
                                # Replace the verse markers of both merged pieces
                                # with one marker for the combined range.
                                for marker in isoup.findAll("sup", _VERSENUM_ATTRS):
                                    marker.extract()
                                lso[ind] = '<sup class="versenum"> ' + gnleft + '-' + gnright + ' </sup>' + str(isoup)
                                # Re-examine the merged piece against its new neighbour.
                                continue
                            ind += 1

                        for piece in lso:
                            isoup = BeautifulSoup(piece, 'html.parser')

                            # A heading inside a piece introduces the NEXT verse.
                            FilteredIntro = ''
                            first_heading = isoup.find("h3")
                            if first_heading is not None:
                                FilteredIntro = _heading_text(isoup.findAll("h3"))
                                first_heading.extract()

                            vtag = isoup.find("sup", _VERSENUM_ATTRS)
                            # '200' is the sentinel for "verse number not found".
                            FilteredVerseNum = vtag.text if vtag is not None else '200'
                            FilteredVerseNum = FilteredVerseNum.strip().replace(' ', '')

                            chaptertag = isoup.find("span", {"class": "chapternum"})
                            if chaptertag is not None:
                                chaptertag.extract()
                            if vtag is not None:
                                vtag.extract()

                            lt = [tag.text for tag in isoup.findAll("span", {"class": "text"})]
                            FilteredVerseText = _clean_verse_text(' '.join(lt))

                            if '-' in FilteredVerseNum:
                                groupings += 1
                                # Sets the module-level sline and updates numoflines.
                                splitversetext(FilteredVerseNum, FilteredVerseText)
                            else:
                                sline = '{' + glava + ':' + FilteredVerseNum + '} ' + FilteredVerseText + os.linesep
                                numoflines += 1

                            if FilteredIntro != '':
                                try:
                                    nextverse = int(FilteredVerseNum) + 1
                                except ValueError:
                                    try:
                                        nextverse = int(FilteredVerseNum.split('-')[1]) + 1
                                    except (ValueError, IndexError):
                                        nextverse = 200
                                sline += '{i' + glava + ':' + str(nextverse) + '} ' + FilteredIntro + os.linesep
                            g.write(sline)

                        # Integrity check against the reference verse count.
                        trebalinii = int(Bibliaa[c.replace(' ', '')][int(glava) - 1].split(':')[1])
                        if numoflines != trebalinii:
                            fintegrity.write('! ' + shortname + ' ' + glava + ' ' + str(numoflines) + ' ' + str(trebalinii) + '\n')
                            errors += 1

                    # Be polite to the server between requests.
                    time.sleep(0.1 + random.uniform(0.1, 0.2))

            time4 = time.time()
            print("#%s/%s %s %s %dh:%dm:%ds / %dh:%dm:%ds / %dh:%dm:%ds" % (
                (str(ibiblecount), str(len(pool)), newabbrev, c.replace(' ', ''))
                + _elapsed_hms(time4 - time3)
                + _elapsed_hms(time4 - time2)
                + _elapsed_hms(time4 - time1)))

        finished = time.time()
        print("#%s/%s %s done. %dh:%dm:%ds / Total: %dh:%dm:%ds" % (
            (str(ibiblecount), str(len(pool)), newabbrev)
            + _elapsed_hms(finished - time2)
            + _elapsed_hms(finished - time1)))
        fintegrity.write('Total number of errors for ' + directoryf + newabbrev + ' : ' + str(errors) + '\n')

    # Marker file: this translation is complete and is skipped on the next run.
    with open(directoryf + newabbrev + '_statusdone', 'w'):
        pass

print('Done!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment