MySQL script to import stackexchange data dumps (as retrieved from here).
The accompanying Ruby script will un-escape various XML-escaped characters that the MySQL import process fails to do.
| # Copyright (c) 2013 Georgios Gousios | |
| # MIT-licensed | |
| create database stackoverflow DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci; | |
| use stackoverflow; | |
| create table badges ( | |
| Id INT NOT NULL PRIMARY KEY, | |
| UserId INT, | |
| Name VARCHAR(50), | |
| CreationDate DATETIME | |
| ); | |
| CREATE TABLE comments ( | |
| Id INT NOT NULL PRIMARY KEY, | |
| PostId INT NOT NULL, | |
| Score INT NOT NULL DEFAULT 0, | |
| Text TEXT, | |
| CreationDate DATETIME, | |
| UserId INT NOT NULL | |
| ); | |
| CREATE TABLE post_history ( | |
| Id INT NOT NULL PRIMARY KEY, | |
| PostHistoryTypeId SMALLINT NOT NULL, | |
| PostId INT NOT NULL, | |
| RevisionGUID VARCHAR(36), | |
| CreationDate DATETIME, | |
| UserId INT NOT NULL, | |
| Text TEXT | |
| ); | |
| CREATE TABLE posts ( | |
| Id INT NOT NULL PRIMARY KEY, | |
| PostTypeId SMALLINT, | |
| AcceptedAnswerId INT, | |
| ParentId INT, | |
| Score INT NULL, | |
| ViewCount INT NULL, | |
| Body text NULL, | |
| OwnerUserId INT NOT NULL, | |
| LastEditorUserId INT, | |
| LastEditDate DATETIME, | |
| LastActivityDate DATETIME, | |
| Title varchar(256) NOT NULL, | |
| Tags VARCHAR(256), | |
| AnswerCount INT NOT NULL DEFAULT 0, | |
| CommentCount INT NOT NULL DEFAULT 0, | |
| FavoriteCount INT NOT NULL DEFAULT 0, | |
| CreationDate DATETIME | |
| ); | |
| CREATE TABLE users ( | |
| Id INT NOT NULL PRIMARY KEY, | |
| Reputation INT NOT NULL, | |
| CreationDate DATETIME, | |
| DisplayName VARCHAR(50) NULL, | |
| LastAccessDate DATETIME, | |
| Views INT DEFAULT 0, | |
| WebsiteUrl VARCHAR(256) NULL, | |
| Location VARCHAR(256) NULL, | |
| AboutMe TEXT NULL, | |
| Age INT, | |
| UpVotes INT, | |
| DownVotes INT, | |
| EmailHash VARCHAR(32) | |
| ); | |
| CREATE TABLE votes ( | |
| Id INT NOT NULL PRIMARY KEY, | |
| PostId INT NOT NULL, | |
| VoteTypeId SMALLINT, | |
| CreationDate DATETIME | |
| ); | |
| load xml infile 'Badges.xml' | |
| into table badges | |
| rows identified by '<row>'; | |
| load xml infile 'Comments.xml' | |
| into table comments | |
| rows identified by '<row>'; | |
| load xml infile 'PostHistory.xml' | |
| into table post_history | |
| rows identified by '<row>'; | |
| load xml infile 'Posts.xml' | |
| into table posts | |
| rows identified by '<row>'; | |
| load xml infile 'Users.xml' | |
| into table users | |
| rows identified by '<row>'; | |
| load xml infile 'Votes.xml' | |
| into table votes | |
| rows identified by '<row>'; | |
| create index badges_idx_1 on badges(UserId); | |
| create index comments_idx_1 on comments(PostId); | |
| create index comments_idx_2 on comments(UserId); | |
| create index post_history_idx_1 on post_history(PostId); | |
| create index post_history_idx_2 on post_history(UserId); | |
| create index posts_idx_1 on posts(AcceptedAnswerId); | |
| create index posts_idx_2 on posts(ParentId); | |
| create index posts_idx_3 on posts(OwnerUserId); | |
| create index posts_idx_4 on posts(LastEditorUserId); | |
| create index votes_idx_1 on votes(PostId); |
| #!/usr/bin/env ruby | |
| # Copyright (c) 2013 Georgios Gousios | |
| # MIT-licensed | |
| require 'sequel' | |
| require 'mysql2' | |
| require 'cgi' | |
| Sequel::Database.extension :pagination | |
| DB = Sequel.connect("mysql2://#{ARGV[1]}:#{ARGV[2]}@#{ARGV[0]}/#{ARGV[3]}") | |
| table = ARGV[4].to_sym | |
| field = ARGV[5].to_sym | |
| lines = 0 | |
| lines_with_escapes = 0 | |
| DB.from(table.to_sym).select(:Id, field).each_page(1000) do |page| | |
| page.each do |row| | |
| lines += 1 | |
| text = row[field] | |
| next if text.nil? | |
| text_unescaped = CGI.unescapeHTML(text) | |
| if text_unescaped != text | |
| lines_with_escapes += 1 | |
| DB[table].filter(:Id => row[:Id]).update(field => text_unescaped) | |
| end | |
| print "\r #{lines} lines, #{lines_with_escapes} with escapes" | |
| end | |
| end |
MySQL script to import stackexchange data dumps (as retrieved from here).
The accompanying Ruby script will un-escape various XML-escaped characters that the MySQL import process fails to do.
Hello from 2020 - one more comment about using nowadays... I would update the statements for the create tables to have CHARACTER SET = utf8; ... might save some trouble for some of these more exotic posts with emojis/symbols.
Two comments if you want to use this;
--secure_file_priv=to the mysqld startup options (orsecure_file_priv=in my.cnf under [mysqld])SET @@SESSION.SQL_MODE='';this is especially important for late 5.7 and 8.0 versions as they have theSTRICT_TRANS_TABLESSQL_MODE turned on by default (and this results in'ERROR 1263 (22004): Column set to default value; NULL supplied to NOT NULL column 'id' at row 1'. For more info, see https://dev.mysql.com/doc/refman/5.7/en/sql-mode.html#sqlmode_strict_trans_tables