ReportGeneration/PythonReportGeneration/process/inference.py

103 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import aspose.words as aw
import pdfkit
from docx import Document
from docx.oxml.ns import qn
import pandas as pd
# from docx.oxml import OxmlElement
def remove_mark_from_doc(doc):
    """Strip the marks left by the Aspose library from a document, in place.

    For every section of ``doc`` the embedded pictures are removed from the
    header and the Aspose evaluation notice is cleared from the footer.
    Afterwards the first body paragraph of the document is deleted.

    Args:
        doc: Document object exposing ``sections`` and ``paragraphs``
            (the caller in this file passes a python-docx ``Document``).
            It is modified in place; nothing is returned.
    """

    def remove_images_from_element(element):
        """Remove embedded pictures from every run of ``element``."""
        for paragraph in element.paragraphs:
            for run in paragraph.runs:
                # Both picture containers are handled the same way; only
                # direct children of the run element are matched.
                for tag in ('w:drawing', 'w:pict'):
                    for node in run._element.findall(qn(tag)):
                        node.getparent().remove(node)

    def remove_text_from_element(element, text_to_remove=None):
        """Clear every paragraph of ``element`` containing ``text_to_remove``.

        When ``text_to_remove`` is omitted, the Aspose evaluation notice
        (used here for the footer) is searched for.
        """
        if text_to_remove is None:
            text_to_remove = "Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd."
        for paragraph in element.paragraphs:
            if text_to_remove in paragraph.text:
                # Empty the matching paragraph; the paragraph itself stays.
                paragraph.clear()

    for section in doc.sections:
        remove_images_from_element(section.header)
        remove_text_from_element(section.footer)

    # Delete the first paragraph of the body. NOTE(review): its content is
    # not checked -- presumably it is the banner Aspose inserts at the top.
    if doc.paragraphs:
        first_element = doc.paragraphs[0]._element
        first_element.getparent().remove(first_element)
def html2docx_aspose(html_path, docx_path):
    """Convert an HTML file to DOCX with Aspose.Words and strip its marks.

    The file at ``html_path`` is loaded by Aspose.Words and saved to
    ``docx_path`` in DOCX format. The result is then reopened with
    python-docx, passed through ``remove_mark_from_doc`` and written back
    to the same path. A success message is printed at the end.

    Args:
        html_path: Path of the HTML file to convert.
        docx_path: Path of the DOCX file to create (overwritten twice).
    """
    # First pass: let Aspose.Words perform the actual conversion.
    converted = aw.Document(html_path)
    converted.save(docx_path, aw.SaveFormat.DOCX)

    # Second pass: reopen the output and remove the marks Aspose added.
    cleaned = Document(docx_path)
    remove_mark_from_doc(cleaned)
    cleaned.save(docx_path)

    print("word报告生成成功")
def html2pdf_pdfkit(html_path, pdf_path):
    """Convert an HTML file to PDF with pdfkit.

    The page is rendered as Letter size with fixed margins and UTF-8
    encoding, and a success message is printed afterwards.

    Args:
        html_path: Path of the HTML file to convert.
        pdf_path: Path of the PDF file to create.
    """
    margins = {
        'margin-top': '0.35in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
    }
    # Entries whose value is None are options given without an argument.
    pdf_options = {
        'page-size': 'Letter',
        **margins,
        'encoding': "UTF-8",
        'no-outline': None,
        'enable-local-file-access': None,
    }
    pdfkit.from_file(html_path, pdf_path, options=pdf_options)
    print("pdf报告生成成功")
def html2excel_pandas(html_path, excel_path):
    """Export every table found in an HTML file to one Excel workbook.

    Each table returned by ``pd.read_html`` is written to its own
    worksheet, named ``Sheet1``, ``Sheet2``, ... in the order the tables
    were parsed, without the DataFrame index. A success message is printed
    once the workbook has been written.

    Args:
        html_path: Path of the HTML file to read tables from.
        excel_path: Path of the Excel workbook to create.
    """
    frames = pd.read_html(html_path)

    # One workbook, one worksheet per parsed table.
    with pd.ExcelWriter(excel_path) as workbook:
        for sheet_no, frame in enumerate(frames, start=1):
            frame.to_excel(workbook, sheet_name=f'Sheet{sheet_no}', index=False)

    print("excel文件生成成功")
############################### Alternative approaches
def html2docx_pypandoc(html_path, docx_path):
    """Alternative converter: turn an HTML file into DOCX with pypandoc.

    A success message is printed after the conversion.

    Args:
        html_path: Path of the HTML file to convert.
        docx_path: Path of the DOCX file to create.
    """
    # Imported inside the function, so pypandoc is only needed when this
    # converter is actually called.
    from pypandoc import convert_file

    convert_file(html_path, to='docx', outputfile=docx_path)
    print("Word报告生成成功")
def html2docx_spire(html_path, docx_path):
    """Alternative converter: turn an HTML file into DOCX with Spire.Doc.

    The HTML is loaded without XHTML validation and saved in the Docx2016
    format. The Spire document is always closed, even when loading or
    saving raises.

    Args:
        html_path: Path of the HTML file to convert.
        docx_path: Path of the DOCX file to create.
    """
    # Imported inside the function so spire.doc is only needed when this
    # converter is called. Aliased so the name does not shadow the
    # python-docx ``Document`` imported at module level.
    from spire.doc import Document as SpireDocument
    from spire.doc import FileFormat, XHTMLValidationType

    document = SpireDocument()
    try:
        document.LoadFromFile(html_path, FileFormat.Html, XHTMLValidationType.none)
        document.SaveToFile(docx_path, FileFormat.Docx2016)
    finally:
        # Release the document even if the conversion failed.
        document.Close()