python解析word文檔

發布時間: 2022-06-09 10:09:03

Ⅰ python word文件處理

#-*- encoding: utf8 -*-
import win32com
from win32com.client import Dispatch, constants
import win32com.client
import __main__
import os
import new
import sys
import re
import string
reload(sys)
sys.setdefaultencoding('utf8')
#from fileinput import filename

class Word(object):
    """Drive Word through COM to copy matching questions between documents.

    The instance keeps three attributes, all set by :meth:`objectword`:
    ``word`` (the Word application COM object), ``docx`` (the most recently
    opened document) and ``wrange`` (``docx.Range(0, 0)``, a zero-length
    range at the start of that document).

    NOTE(review): nothing in this class saves a document after
    ``InsertAfter``; confirm whether a Save call is needed before
    :meth:`close`.
    """

    # Splits a question segment on its numbering marker; the capture group
    # keeps each marker in the split result.
    # NOTE(review): the trailing "." is unescaped, so it matches any
    # character, not only a literal dot -- confirm that this is intended.
    _NUMBERING = re.compile(r"([1][0-9][0-9]+.)")

    def __init__(self, uri):
        """Open the document at *uri*."""
        self.objectword(uri)

    def objectword(self, url):
        """Open *url* and make it the current document.

        The application object is created on the first call only and then
        reused, so repeated calls no longer replace ``self.word`` with a
        freshly dispatched object.
        """
        if getattr(self, 'word', None) is None:
            self.word = win32com.client.Dispatch('Word.Application')
            self.word.Visible = 0
            self.word.DisplayAlerts = 0

        self.docx = self.word.Documents.Open(url)
        self.wrange = self.docx.Range(0, 0)

    def close(self):
        """Close all open documents and quit Word."""
        self.word.Documents.Close()
        self.word.Quit()

    def create(self):
        """Placeholder for document creation; does nothing."""
        pass

    def findword(self, key, uri=r'E:\XE\ctb.docx'):
        """Search the document at *uri* for questions matching *key*.

        The document text is split on ``"&"``; each segment is split again on
        its numbering markers. Whenever a piece contains the first three
        characters of *key*, the whole segment (pieces joined by a single
        space) is printed and passed to :meth:`insertword`.

        :param key: text to look for; only ``key[0:3]`` is used.
        :param uri: document to search; defaults to the path that used to be
            hard-coded here.
        """
        self.objectword(uri)
        # Read the whole document content through the explicit Text property.
        content = self.docx.Range(self.docx.Content.Start,
                                  self.docx.Content.End).Text
        needle = key[0:3]
        for segment in content.split("&"):
            segment = segment.strip('\n')
            pieces = self._NUMBERING.split(segment)
            # Drop whatever precedes the first numbering marker.
            del pieces[0]
            for piece in pieces:
                if needle in piece:
                    joined = " ".join(pieces)
                    print(joined)
                    self.insertword(joined)
                    print("sss")
                else:
                    print("ttt")

    def insertword(self, w, url=r'E:\XE\ctb.doc'):
        """Open *url* and call ``InsertAfter(w)`` on its start range.

        :param w: text to insert.
        :param url: target document; defaults to the path that used to be
            hard-coded here.
        """
        self.objectword(url)
        self.wrange.InsertAfter(w)

    def source(self, src):
        """Read the tab-separated file *src* and look up each wrong answer.

        Each non-blank line must hold exactly six tab-separated fields; the
        second one is passed to :meth:`findword` unless it equals ``全對``.

        :param src: path of a UTF-8 encoded text file.
        :returns: ``self``, so calls can be chained.
        :raises ValueError: if a non-blank line does not have six fields.
        """
        import io  # local import: the file header does not import io

        with io.open(src, encoding='utf8') as handle:
            for row in handle:
                if not row.strip():
                    # A blank (e.g. trailing) line carries no record.
                    continue
                _, question01, _, _, _, _ = row.split('\t')
                if question01 != u'全對':
                    self.findword(question01)
        return self


if __name__ == '__main__':
    app = Word(r'E:\XE\xx.docx')
    try:
        app.source(r'E:\XE\xe.txt')
    finally:
        # Quit Word even when processing fails.
        app.close()

Ⅱ python能打開word文檔嗎

首先下載安裝win32com

from win32com import client as wc

# Obtain the Word application COM object.
word = wc.Dispatch('Word.Application')
try:
    doc = word.Documents.Open('c:/test')
    try:
        # 2 is the wdFormatText save format.
        doc.SaveAs('c:/test.text', 2)
    finally:
        # Close the document even if saving failed.
        doc.Close()
finally:
    # Quit even if opening or saving failed so Word is not left running.
    word.Quit()

用這種方式產生的text文檔，無法在python中以普通的 'r' 模式讀取；為了讓python可以用 'r' 模式讀取，應當寫成

doc.SaveAs('c:/test', 4)

注意：系統執行完成後，會自動產生文件後綴txt（雖然沒有指明後綴）。
在xp系統下面，應當這樣打開：

open(r'c:\text','r')
# Word WdSaveFormat values, usable as the format argument of SaveAs.
wdFormatDocument = 0
wdFormatDocument97 = 0
wdFormatDocumentDefault = 16
wdFormatDOSText = 4
wdFormatDOSTextLineBreaks = 5
wdFormatEncodedText = 7
wdFormatFilteredHTML = 10
wdFormatFlatXML = 19
wdFormatFlatXMLMacroEnabled = 20
wdFormatFlatXMLTemplate = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML = 8
wdFormatPDF = 17
wdFormatRTF = 6
wdFormatTemplate = 1
wdFormatTemplate97 = 1
wdFormatText = 2
wdFormatTextLineBreaks = 3
wdFormatUnicodeText = 7
wdFormatWebArchive = 9
wdFormatXML = 11
wdFormatXMLDocument = 12
wdFormatXMLDocumentMacroEnabled = 13
wdFormatXMLTemplate = 14
wdFormatXMLTemplateMacroEnabled = 15
wdFormatXPS = 18

照著字面意思應該能對應到相應的文件格式；如果你用的是office 2003，可能支持不了這麼多格式。
word文件轉html有兩種格式可選：wdFormatHTML、wdFormatFilteredHTML（對應數字8、10）。
區別是：如果選用wdFormatHTML格式，word文件裡面的公式等ole對象將會存儲成wmf格式；
而選用wdFormatFilteredHTML的話，公式圖片將存儲為gif格式，
而且目測可以看出用wdFormatFilteredHTML生成的HTML明顯比wdFormatHTML要乾淨許多。

當然你也可以用任意一種語言通過com來調用office API，比如PHP.

from win32com import client as wc
import re

# Export c:/test1.doc as text through Word's COM interface.
word = wc.Dispatch('Word.Application')
try:
    doc = word.Documents.Open(r'c:/test1.doc')
    try:
        # 4 is the wdFormatDOSText save format.
        doc.SaveAs('c:/test1.text', 4)
    finally:
        doc.Close()
finally:
    # The original snippet never quit Word; do it even on failure.
    word.Quit()

with open(r'c:\test1.text', 'r') as text_file:
    strings = text_file.read()

# Matches an answer letter A-D inside ASCII or full-width parentheses,
# optionally padded by whitespace or \xa1 characters.
# NOTE(review): \xa1 is presumably part of a GBK full-width space -- confirm.
answer_pattern = '\(\s*[A-D]\s*\)|\(\xa1*[A-D]\xa1*\)|\（\s*[A-D]\s*\）|\（\xa1*[A-D]\xa1*\）'
result = re.findall(answer_pattern, strings)
# Blank out every filled-in answer.
chan = re.sub(answer_pattern, '()', strings)

# Append the question text with the answers removed.
with open(r'c:\question', 'a+') as question:
    question.write(chan)

# Append one "<number> <letter>" line per answer found, numbered from 1.
with open(r'c:\answeronly', 'a+') as answer:
    for number, found in enumerate(result, 1):
        m = re.search('[A-D]', found)
        answer.write(str(number) + ' ' + m.group() + '\n')

# This substitution runs after both files are written and its result is not
# used afterwards in this snippet.
# NOTE(review): \xa3\xa8 / \xa3\xa9 are presumably the GBK bytes of the
# full-width parentheses -- confirm.
chan = re.sub(r'\xa3\xa8\s*[A-D]\s*\xa3\xa9', '()', strings)
# Do not use (); it easily causes ambiguity.

Ⅲ python如何讀取word文件

def PrintAllParagraphs(doc):
    """Print the text of every paragraph of *doc*, last paragraph first.

    :param doc: object whose ``Paragraphs`` collection exposes ``Count`` and
        index access, and whose items expose ``Range.Text``.
    """
    count = doc.Paragraphs.Count
    # Walk the indexes from count - 1 down to 0.
    for i in range(count - 1, -1, -1):
        pr = doc.Paragraphs[i].Range
        print(pr.Text)


>>> app = my.Office.Word.GetInstance()
>>> doc = app.Documents[0]
>>> PrintAllParagraphs(doc)

1.什麼是域

域應用基礎

>>>

@staticmethod
def GetInstance():
    u'''Get the Application object of the Word application.'''
    import win32com.client
    return win32com.client.Dispatch('Word.Application')

my.Office.Word.GetInstance的方法實現如上，是一個使用win32com操縱Word Com的介面的封裝
所有Paragraph即段落對象，都是通過Paragraph.Range.Text來訪問它的文字的

Ⅳ python處理word文檔

有個庫叫『Python-docx』
安裝之後 python 可以讀寫 word 文檔，就可以拼接了。

Ⅴ word圖片和文字文混排內容怎麼用python讀取寫入

Python可以利用python-docx模塊處理word文檔，處理方式是面向對象的。也就是說python-docx模塊會把word文檔，文檔中的段落、文本、字體等都看做對象，對對象進行處理就是對word文檔的內容處理。

二，相關概念
如果需要讀取word文檔中的文字（一般來說，程序也只需要認識word文檔中的文字信息），需要先了解python-docx模塊的幾個概念。

1，Document對象，表示一個word文檔。
2，Paragraph對象，表示word文檔中的一個段落
3，Paragraph對象的text屬性，表示段落中的文本內容。
三，模塊的安裝和導入
需要注意，python-docx模塊安裝需要在cmd命令行中輸入pip install python-docx，如下圖表示安裝成功（最後那句英文Successfully installed，成功地安裝完成，十分考驗英文水平。）

注意在導入模塊時，用的是import docx。

也真是奇了怪了，怎麼安裝和導入模塊時，很多都不用一個名字，看來是很有必要出一個python版本的模塊管理程序python-maven了，本段純屬PS。

四，讀取word文本
在了解了上面的信息之後，就很簡單了，下面先創建一個D:\temp\word.docx文件，並在其中輸入如下內容。

然後寫一段程序，代碼及輸出結果如下：

# Example: read the text of a .docx file
import docx

# Get the document object
document = docx.Document("D:\\temp\\word.docx")
# Fetch the paragraph list once and reuse it for every loop below.
paragraphs = document.paragraphs
# The sample document has 13 paragraphs; each line break starts a new one.
print("段落數:" + str(len(paragraphs)))

# Print the content of every paragraph
for para in paragraphs:
    print(para.text)

# Print each paragraph's index together with its content
for i, para in enumerate(paragraphs):
    print("第" + str(i) + "段的內容是：" + para.text)
運行結果：

================ RESTART: F:/360data/重要數據/桌面/學習筆記/readWord.py ================
段落數:13
啊

我看見一座山

雄偉的大山

真高啊

啊

這座山是！

真的很高！
第0段的內容是：啊
第1段的內容是：
第2段的內容是：我看見一座山
第3段的內容是：
第4段的內容是：雄偉的大山
第5段的內容是：
第6段的內容是：真高啊
第7段的內容是：
第8段的內容是：啊
第9段的內容是：
第10段的內容是：這座山是！
第11段的內容是：
第12段的內容是：真的很高！
>>>
總結
以上就是本文關於Python讀取word文本操作詳解的全部內容，希望對大家有所幫助。感興趣的朋友可以繼續參閱本站其他相關專題，如有不足之處，歡迎留言指出。感謝朋友們對本站的支持！

Ⅵ 求助大神：如何用Python docx解析一個Word文檔，在某些欄位處插入文本或表格，更換頁眉頁腳等急~

from docx import Document
from docx.shared import Inches

# Build a new, empty document.
document = Document()

document.add_heading('Document Title', 0)

# A paragraph made of runs with different character formatting.
p = document.add_paragraph('A plain paragraph having some ')
p.add_run('bold').bold = True
p.add_run(' and some ')
p.add_run('italic.').italic = True

document.add_heading('Heading, level 1', level=1)
# Styles are addressed by their display names (with spaces).
document.add_paragraph('Intense quote', style='Intense Quote')

document.add_paragraph(
    'first item in unordered list', style='List Bullet'
)
document.add_paragraph(
    'first item in ordered list', style='List Number'
)

# Requires monty-truth.png to exist in the working directory.
document.add_picture('monty-truth.png', width=Inches(1.25))

# Sample rows for the table; the original snippet used an undefined
# ``recordset`` name here.
recordset = (
    (3, '101', 'Spam'),
    (7, '422', 'Eggs'),
    (4, '631', 'Spam, spam, eggs, and spam'),
)

table = document.add_table(rows=1, cols=3)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Qty'
hdr_cells[1].text = 'Id'
hdr_cells[2].text = 'Desc'
# Append one table row per record.
for qty, item_id, desc in recordset:
    row_cells = table.add_row().cells
    row_cells[0].text = str(qty)
    row_cells[1].text = str(item_id)
    row_cells[2].text = desc

document.add_page_break()

document.save('demo.docx')
這是一個demo for docx 你可以試試

Ⅶ python讀取word文檔內容

import fnmatch
import io
import os
import sys
import win32com.client

# Directory tree that is searched for .docx files.
readpath = r'D:\123'

wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
try:
    for path, dirs, files in os.walk(readpath):
        for filename in files:
            if not fnmatch.fnmatch(filename, '*.docx'):
                continue
            doc = os.path.abspath(os.path.join(path, filename))
            print('processing %s...' % doc)
            # Keep the returned document instead of relying on ActiveDocument.
            document = wordapp.Documents.Open(doc)
            try:
                # Same name, with the extension replaced by .txt.
                docastext = os.path.splitext(doc)[0] + '.txt'
                document.SaveAs(
                    docastext,
                    FileFormat=win32com.client.constants.wdFormatText)
            finally:
                # Close the document even if saving failed.
                document.Close()
finally:
    wordapp.Quit()
print('end')

# Print one converted file, decoding it as GBK.
with io.open(r'd:\123\test.txt', 'r', encoding='gbk') as f:
    for line in f:
        print(line)

Ⅷ 如何在 Linux 上使用 Python 讀取 word 文件信息

第一步：獲取docx文件的xml組成文件

import zipfile


def get_word_xml(docx_filename):
    """Return the raw ``word/document.xml`` member of a .docx archive.

    :param docx_filename: path of the .docx (zip) file.
    :returns: the member's content as bytes.
    :raises KeyError: if the archive has no ``word/document.xml`` member.
    """
    # Let ZipFile open the path itself (in binary mode) and close it again.
    with zipfile.ZipFile(docx_filename) as archive:
        return archive.read('word/document.xml')

第二步：解析xml為樹形數據結構
from lxml import etree


def get_xml_tree(xml_string):
    """Parse *xml_string* with ``etree.fromstring`` and return the result."""
    return etree.fromstring(xml_string)

第三步：讀取word內容：
def _itertext(self, my_etree):
    """Iterator to go through xml tree's text nodes.

    Yields ``(node, node.text)`` for every element that
    ``_check_element_is`` accepts as a ``t`` element.
    """
    for node in my_etree.iter(tag=etree.Element):
        if self._check_element_is(node, 't'):
            yield (node, node.text)


def _check_element_is(self, element, type_char):
    """Return True when *element*'s tag is *type_char* in the Word namespace.

    :param element: object with a ``tag`` attribute in ``{namespace}name`` form.
    :param type_char: local tag name to test for, e.g. ``'t'``.
    """
    # WordprocessingML main namespace; the placeholder value '99999' found
    # here before could never match a real document tag.
    word_schema = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    return element.tag == '{%s}%s' % (word_schema, type_char)

Ⅸ 如何用python讀取word

使用Python的內部方法open()讀取文本文件

# The with-statement closes the file on every path and, unlike the former
# try/finally, does not touch an unbound name when open() itself fails.
with open('/file', 'r') as f:
    print(f.read())

如果讀取word文檔推薦使用第三方插件，python-docx 可以在官網上下載

使用方式

#-*-coding:cp936-*-
import docx

# Placeholder: replace with the path of the .docx file to read.
file_path = 'path/to/file.docx'
document = docx.Document(file_path)
# Join the paragraph texts, separated by a blank line.
docText = '\n\n'.join(
    paragraph.text for paragraph in document.paragraphs
)
print(docText)

Ⅹ python如何讀取word文件中的文本內容並寫入到新的txt文件

閱讀全文

熱點內容

隨機啟動腳本發布：2025-07-05 16:10:30 瀏覽：532

微博資料庫設計發布：2025-07-05 15:30:55 瀏覽：30

linux485 發布：2025-07-05 14:38:28 瀏覽：310

php用的軟體發布：2025-07-05 14:06:22 瀏覽：760

沒有許可權訪問計算機發布：2025-07-05 13:29:11 瀏覽：436

javaweb開發教程視頻教程發布：2025-07-05 13:24:41 瀏覽：718

康師傅控流腳本破解發布：2025-07-05 13:17:27 瀏覽：246

java的開發流程發布：2025-07-05 12:45:11 瀏覽：692

怎麼看內存卡配置發布：2025-07-05 12:29:19 瀏覽：288

訪問學者英文個人簡歷發布：2025-07-05 12:29:17 瀏覽：837

python解析word文檔

與python解析word文檔相關的資訊