103 lines
3.1 KiB
Python
103 lines
3.1 KiB
Python
import aspose.words as aw
|
||
import pdfkit
|
||
from docx import Document
|
||
from docx.oxml.ns import qn
|
||
import pandas as pd
|
||
# from docx.oxml import OxmlElement
|
||
|
||
|
||
def remove_mark_from_doc(doc):
    """Strip the Aspose.Words evaluation marks from a document, in place.

    Three things are removed:

    * ``w:drawing`` and ``w:pict`` elements found directly inside the runs
      of every section header,
    * the content of every footer paragraph that contains the evaluation
      notice,
    * the first body paragraph of the document.

    Args:
        doc: Document object exposing ``sections`` and ``paragraphs``
            (the caller in this file opens it with ``docx.Document``).

    Returns:
        None. ``doc`` is modified in place; saving is left to the caller.
    """
    # Only the year-independent prefix of the notice is matched. The full
    # notice ends with a copyright range
    # ("... Copyright 2003-2024 Aspose Pty Ltd."); pinning that range would
    # silently stop matching when the notice carries a different year.
    evaluation_notice = "Evaluation Only. Created with Aspose.Words."

    def remove_images_from_element(element):
        """Delete embedded drawings and pictures from the runs of *element*."""
        for paragraph in element.paragraphs:
            for run in paragraph.runs:
                for tag in ('w:drawing', 'w:pict'):
                    # findall() returns a list, so removing while looping
                    # over it is safe.
                    for node in run._element.findall(qn(tag)):
                        node.getparent().remove(node)

    def remove_text_from_element(element, text_to_remove=None):
        """Clear every paragraph of *element* that contains *text_to_remove*.

        When *text_to_remove* is omitted, the evaluation notice prefix is
        used.
        """
        if text_to_remove is None:
            text_to_remove = evaluation_notice

        for paragraph in element.paragraphs:
            if text_to_remove in paragraph.text:
                # Empty the whole paragraph, not just the matching substring.
                paragraph.clear()

    for section in doc.sections:
        remove_images_from_element(section.header)
        remove_text_from_element(section.footer)

    # Drop the first body paragraph.
    # NOTE(review): this is unconditional — presumably the evaluation notice
    # is always inserted as the first paragraph; confirm before running this
    # on documents that do not carry the notice.
    if doc.paragraphs:
        first_element = doc.paragraphs[0]._element
        first_element.getparent().remove(first_element)
|
||
|
||
|
||
def html2docx_aspose(html_path, docx_path):
    """Convert an HTML file to DOCX with Aspose.Words, then clean the result.

    The DOCX written by Aspose.Words is reopened with python-docx, passed
    through ``remove_mark_from_doc`` and saved back to the same path.

    Args:
        html_path: Path of the HTML file to convert.
        docx_path: Path of the DOCX file to write.
    """
    # Pass 1: Aspose.Words performs the HTML -> DOCX conversion.
    aspose_document = aw.Document(html_path)
    aspose_document.save(docx_path, aw.SaveFormat.DOCX)

    # Pass 2: reopen the file with python-docx and strip the Aspose marks.
    cleaned_document = Document(docx_path)
    remove_mark_from_doc(cleaned_document)
    cleaned_document.save(docx_path)

    print("word报告生成成功!")
|
||
|
||
def html2pdf_pdfkit(html_path, pdf_path):
    """Render an HTML file to PDF through pdfkit.

    Args:
        html_path: Path of the HTML file to convert.
        pdf_path: Path of the PDF file to write.
    """
    page_setup = {'page-size': 'Letter'}
    margins = {
        'margin-top': '0.35in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
    }
    rendering = {
        'encoding': "UTF-8",
        'no-outline': None,
        'enable-local-file-access': None,
    }

    # Merge the groups in the same key order as a single literal would have.
    pdfkit.from_file(
        html_path,
        pdf_path,
        options={**page_setup, **margins, **rendering},
    )
    print("pdf报告生成成功!")
|
||
|
||
|
||
def html2excel_pandas(html_path, excel_path):
    """Export every table of an HTML file to its own sheet of an Excel file.

    Sheets are named ``Sheet1``, ``Sheet2``, ... in the order the tables are
    returned by ``pandas.read_html``; the DataFrame index is not written.

    Args:
        html_path: Path of the HTML file to read tables from.
        excel_path: Path of the Excel workbook to write.
    """
    frames = pd.read_html(html_path)

    # One workbook, one sheet per parsed table.
    with pd.ExcelWriter(excel_path) as workbook:
        for sheet_number, frame in enumerate(frames, start=1):
            frame.to_excel(
                workbook,
                sheet_name=f'Sheet{sheet_number}',
                index=False,
            )

    print("excel文件生成成功!")
|
||
|
||
|
||
|
||
|
||
############################### Alternative approaches
|
||
|
||
def html2docx_pypandoc(html_path, docx_path):
    """Convert an HTML file to DOCX using pypandoc.

    Args:
        html_path: Path of the HTML file to convert.
        docx_path: Path of the DOCX file to write.
    """
    # Imported lazily so pypandoc is only needed when this converter is used.
    from pypandoc import convert_file

    convert_file(html_path, 'docx', outputfile=docx_path)
    print("Word报告生成成功!")
|
||
|
||
|
||
def html2docx_spire(html_path, docx_path):
    """Convert an HTML file to DOCX using Spire.Doc.

    Args:
        html_path: Path of the HTML file to load.
        docx_path: Path of the DOCX file to write (saved as ``Docx2016``).

    Raises:
        Whatever ``LoadFromFile`` or ``SaveToFile`` raises; the document is
        closed before the exception propagates.
    """
    # Imported lazily so Spire.Doc is only needed when this converter is
    # used. This ``Document`` is local and distinct from the module-level
    # ``docx.Document``.
    from spire.doc import FileFormat, XHTMLValidationType, Document

    document = Document()
    try:
        document.LoadFromFile(html_path, FileFormat.Html, XHTMLValidationType.none)
        document.SaveToFile(docx_path, FileFormat.Docx2016)
    finally:
        # Always release the document, even when loading or saving fails.
        document.Close()