Python Text Processing

http://www.unixuser.org/~euske/python/pdfminer/index.html
http://zhufeng9282.blog.163.com/blog/static/291591002011112710830393/

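To enable CJK (Chinese/Japanese/Korean) support, build the CMap data and then install PDFMiner. Run these commands on Windows from the PDFMiner source directory:

mkdir pdfminer\cmap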
python tools\conv_cmap.py pdfminer\cmap Adobe-CNS1 cmaprsrc\cid2code_Adobe_CNS1.txt cp950 big5
python tools\conv_cmap.py pdfminer\cmap Adobe-GB1 cmaprsrc\cid2code_Adobe_GB1.txt cp936 gb2312
python tools\conv_cmap.py pdfminer\cmap Adobe-Japan1 cmaprsrc\cid2code_Adobe_Japan1.txt cp932 euc-jp
python tools\conv_cmap.py pdfminer\cmap Adobe-Korea1 cmaprsrc\cid2code_Adobe_Korea1.txt cp949 euc-kr
python setup.py install

'''
Created on 2011-12-27
 
@author: zhufeng
'''
 
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
 
def convert(filename):
    """Extract the text of a PDF file into a UTF-8 encoded text file.

    The output is written to ``filename + '.txt'``, i.e. next to the input.

    :param filename: path of the PDF file to read.
    :raises IOError: if the input cannot be opened or the output cannot be
        created. Errors raised by pdfminer propagate unchanged.
    """
    outfile = filename + '.txt'
    rsrcmgr = PDFResourceManager()

    # Open the input first so that a missing or unreadable PDF does not leave
    # an empty output file behind. The ``with`` blocks guarantee both files
    # are closed even when the conversion fails part-way.
    with open(filename, 'rb') as fp:
        with open(outfile, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                                   laparams=LAParams())
            try:
                # NOTE(review): an empty page set and maxpages=0 appear to
                # mean "no restriction" in pdfminer -- confirm against the
                # installed version.
                process_pdf(rsrcmgr, device, fp, pagenos=set(), maxpages=0,
                            password='', check_extractable=True)
            finally:
                # Close the converter before the output file it writes to.
                device.close()
 
if __name__ == '__main__':
    import sys

    # Convert every path given on the command line; with no arguments, fall
    # back to the original hard-coded sample file.
    for pdf_path in sys.argv[1:] or ['g:/a.pdf']:
        convert(pdf_path)