Skip to content

Instantly share code, notes, and snippets.

@buaahsh
Created February 1, 2018 01:59
Show Gist options
  • Save buaahsh/11df842c501c7138ecd62af5ae7a6115 to your computer and use it in GitHub Desktop.
Save buaahsh/11df842c501c7138ecd62af5ae7a6115 to your computer and use it in GitHub Desktop.
Jieba windows multiprocessing
from path import Path
from multiprocessing import Pool
import argparse
import time
# Number of lines handed to a worker per task: do() slices its input into
# chunks of this size. Rebound from --LINE_PER_CORE when run as a script.
LINE_PER_CORE = 5000
# Number of worker processes in the pool created by do(). Rebound from
# --NUM_CORE when run as a script.
NUM_CORE = 30
# NOTE(review): FLOOR_COUNT and CEIL_COUNT are not referenced anywhere in the
# visible code -- presumably leftover thresholds; confirm before removing.
FLOOR_COUNT = 10
CEIL_COUNT = 200
import jieba
def process_one(_in):
    """Segment every line of one chunk with jieba.

    Each line of ``_in`` is tokenized with ``jieba.cut``, the tokens are
    joined with single spaces, and leading/trailing whitespace is stripped.

    Returns:
        A list with one segmented string per input line, in input order.
    """
    return [' '.join(jieba.cut(raw_line)).strip() for raw_line in _in]
def do(l_list, writer):
    """Segment a batch of lines in parallel and write the results in order.

    The batch is sliced into chunks of ``LINE_PER_CORE`` lines, each chunk is
    processed by ``process_one`` in a pool of ``NUM_CORE`` worker processes,
    and every segmented line is written to ``writer`` followed by a newline.

    Args:
        l_list: List of raw text lines to segment.
        writer: Object with a ``write(str)`` method (e.g. an open text file).

    Raises:
        Whatever ``process_one`` raises in a worker is re-raised here by
        ``pool.map``; the pool is torn down before the exception propagates.
    """
    # Slice before creating the pool so an invalid LINE_PER_CORE (e.g. 0)
    # fails without leaving worker processes behind.
    chunks = [l_list[start:start + LINE_PER_CORE]
              for start in range(0, len(l_list), LINE_PER_CORE)]

    pool = Pool(NUM_CORE)
    try:
        # map() preserves chunk order, so output lines keep input order.
        r_list = pool.map(process_one, chunks)
    except BaseException:
        # Do not leak worker processes when a task fails or the run is
        # interrupted; stop them immediately.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # join() is only valid after close() or terminate(); both paths
        # above guarantee that.
        pool.join()

    for chunk_result in r_list:
        for line in chunk_result:
            writer.write(line + '\n')
if __name__ == '__main__':
    # Script entry point: segment every *.txt file of the input folder and
    # write "<name>.output.txt" files into the output folder.
    # The __main__ guard is required for multiprocessing on Windows, where
    # worker processes re-import this module.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i","--input", help="input folder", default=".")
    parser.add_argument("-o", "--output", help="output folder", default="w_process")
    parser.add_argument("--LINE_PER_CORE", help="# lines per core", type=int, default=20000)
    parser.add_argument("--NUM_CORE", help="# of cores", type=int, default=30)
    parser.add_argument("--coding", type=str, default="utf-8")
    args = parser.parse_args()
    print("Args :", args)
    input_folder = args.input
    output_folder = args.output
    # Rebind the module-level settings that do() reads.
    LINE_PER_CORE = args.LINE_PER_CORE
    NUM_CORE = args.NUM_CORE
    coding = args.coding
    # Lines buffered before each call to do(): one full chunk per worker.
    batch_size = NUM_CORE * LINE_PER_CORE
    if not Path(output_folder).exists():
        Path(output_folder).mkdir()
    for f in Path(input_folder).files('*.txt'):
        print(f.basename(), time.strftime('%Y-%m-%d %X', time.localtime()))
        # NOTE(review): newer releases of the `path` package appear to
        # deprecate `namebase` in favor of `stem` -- confirm against the
        # installed version.
        # The input is decoded with --coding (previously parsed but ignored);
        # the output is always written as UTF-8, as before.
        with open(output_folder + '/%s.output.txt' % (f.namebase,),'w', encoding='utf-8') as f_out, \
                open(f.abspath(),'r', encoding=coding) as f_in:
            l_list = []
            for l in f_in:
                # Append first, then flush once the batch is full, so the
                # line that fills the batch is never dropped.
                l_list.append(l)
                if len(l_list) >= batch_size:
                    do(l_list, f_out)
                    print(f.basename(), time.strftime('%Y-%m-%d %X', time.localtime()))
                    l_list = []
            # Flush the final, partially filled batch.
            if l_list:
                do(l_list, f_out)
    print(time.strftime('%Y-%m-%d %X', time.localtime()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment