Created
February 1, 2018 01:59
-
-
Save buaahsh/11df842c501c7138ecd62af5ae7a6115 to your computer and use it in GitHub Desktop.
Jieba windows multiprocessing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from path import Path | |
from multiprocessing import Pool | |
import argparse | |
import time | |
# Default number of input lines handed to a single worker task (the chunk
# size used by do()); replaced by --LINE_PER_CORE when run as a script.
LINE_PER_CORE = 5000
# Default number of worker processes in the pool created by do(); replaced
# by --NUM_CORE when run as a script.
NUM_CORE = 30
# NOTE(review): FLOOR_COUNT and CEIL_COUNT are not referenced anywhere in the
# visible code; kept in case another part of the project reads them.
FLOOR_COUNT = 10
CEIL_COUNT = 200
import jieba | |
def process_one(_in):
    """Segment every line of one chunk with jieba.

    Each line is passed to ``jieba.cut``; the resulting tokens are joined
    with single spaces and leading/trailing whitespace is stripped.

    Args:
        _in: Iterable of text lines (one chunk of the current batch).

    Returns:
        A list with one segmented string per input line, in input order.
    """
    return [' '.join(jieba.cut(raw_line)).strip() for raw_line in _in]
def do(l_list, writer):
    """Segment a batch of lines in parallel and write the results in order.

    The batch is split into chunks of ``LINE_PER_CORE`` lines, each chunk is
    processed by ``process_one`` in a pool of ``NUM_CORE`` worker processes,
    and every segmented line is written to ``writer`` followed by a newline.

    Args:
        l_list: List of input lines making up the batch.
        writer: Object with a ``write(str)`` method (e.g. an open text file).

    Raises:
        Whatever ``process_one`` raises in a worker is re-raised here by
        ``pool.map``; the pool is torn down before the exception propagates.
    """
    chunks = [
        l_list[start:start + LINE_PER_CORE]
        for start in range(0, len(l_list), LINE_PER_CORE)
    ]
    # The context manager terminates the workers if map() raises, so a failed
    # batch no longer leaves NUM_CORE orphaned processes behind.
    with Pool(NUM_CORE) as pool:
        r_list = pool.map(process_one, chunks)
        # Success path: shut the workers down gracefully, as before.
        pool.close()
        pool.join()
    # pool.map preserves chunk order, so output order matches input order.
    for lr in r_list:
        for line in lr:
            writer.write(line + '\n')
if __name__ == '__main__':
    # Script entry point: segment every *.txt file in the input folder and
    # write "<name>.output.txt" into the output folder.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i","--input", help="input folder", default=".")
    parser.add_argument("-o", "--output", help="output folder", default="w_process")
    parser.add_argument("--LINE_PER_CORE", help="# lines per core", type=int, default=20000)
    parser.add_argument("--NUM_CORE", help="# of cores", type=int, default=30)
    parser.add_argument("--coding", type=str, default="utf-8")
    args = parser.parse_args()
    print("Args :", args)
    input_folder = args.input
    output_folder = args.output
    # Rebind the module-level settings that do() reads in this (parent) process.
    LINE_PER_CORE = args.LINE_PER_CORE
    NUM_CORE = args.NUM_CORE
    coding = args.coding
    if not Path(output_folder).exists():
        Path(output_folder).mkdir()
    # A batch holds enough lines to give every worker one full chunk.
    batch_size = NUM_CORE * LINE_PER_CORE
    for f in Path(input_folder).files('*.txt'):
        print(f.basename(), time.strftime('%Y-%m-%d %X', time.localtime()))
        # --coding was previously parsed but ignored (utf-8 was hard-coded);
        # it is now applied to both the input and the output file.
        with open(output_folder + '/%s.output.txt' % (f.namebase,), 'w', encoding=coding) as f_out:
            with open(f.abspath(), 'r', encoding=coding) as f_in:
                l_list = []
                for line in f_in:
                    # Always keep the current line, then flush once the batch
                    # is full. The earlier version flushed *instead of*
                    # appending, silently dropping one line per batch.
                    l_list.append(line)
                    if len(l_list) >= batch_size:
                        do(l_list, f_out)
                        print(f.basename(), time.strftime('%Y-%m-%d %X', time.localtime()))
                        l_list = []
                # Flush whatever is left after the last full batch.
                if l_list:
                    do(l_list, f_out)
    print(time.strftime('%Y-%m-%d %X', time.localtime()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment