# ReportGeneration/PythonReportGeneration/process/inference.py

import aspose.words as aw
import pdfkit
from docx import Document
from docx.oxml.ns import qn
import pandas as pd
# from docx.oxml import OxmlElement


def remove_mark_from_doc(doc):
    """Strip the Aspose evaluation marks from a document, in place.

    Three things are removed:

    * ``w:drawing`` / ``w:pict`` children of every run in each section header
      (the embedded pictures);
    * the content of each section-footer paragraph whose text contains the
      Aspose evaluation notice;
    * the first body paragraph of the document, whatever it contains.

    Args:
        doc: Object exposing ``sections`` (each with ``header`` and ``footer``)
            and ``paragraphs``; the caller in this file passes the result of
            ``docx.Document(path)``.

    Returns:
        None. ``doc`` is modified in place; saving it is left to the caller.
    """
    import re

    # The notice embeds a year ("Copyright 2003-2024 ..."). Matching any
    # four-digit end year keeps the cleanup working when the converter prints
    # a different one, while still matching the original 2003-2024 text.
    default_notice = re.compile(
        r"Evaluation Only\. Created with Aspose\.Words\. "
        r"Copyright 2003-\d{4} Aspose Pty Ltd\."
    )

    def remove_images_from_element(element):
        # Remove embedded pictures that sit directly inside the runs.
        for paragraph in element.paragraphs:
            for run in paragraph.runs:
                for tag in (qn('w:drawing'), qn('w:pict')):
                    for node in run._element.findall(tag):
                        node.getparent().remove(node)

    def remove_text_from_element(element, text_to_remove=None):
        # Clear every paragraph carrying the Aspose notice (used on footers).
        # ``text_to_remove`` is a literal substring; None selects the
        # year-agnostic default pattern above.
        for paragraph in element.paragraphs:
            text = paragraph.text
            if text_to_remove is None:
                found = default_notice.search(text) is not None
            else:
                found = text_to_remove in text
            if found:
                # The paragraph itself stays; only its content is emptied.
                paragraph.clear()

    for section in doc.sections:
        remove_images_from_element(section.header)
        remove_text_from_element(section.footer)

    # Delete the first body paragraph unconditionally.
    # NOTE(review): presumably this is the evaluation banner inserted at the
    # top of the converted document - confirm, since real content would be
    # dropped if no banner is present.
    body_paragraphs = doc.paragraphs
    if body_paragraphs:
        first = body_paragraphs[0]._element
        first.getparent().remove(first)


def html2docx_aspose(html_path, docx_path):
    """Convert an HTML file to DOCX with Aspose.Words, then strip its marks.

    The DOCX is written to ``docx_path`` twice: first by Aspose.Words, then
    again after it has been reopened and passed through
    ``remove_mark_from_doc``.

    Args:
        html_path: Path of the HTML file to convert.
        docx_path: Path of the DOCX file to create (overwritten in the
            second step).
    """
    # Step 1: Aspose.Words loads the HTML and writes the raw DOCX.
    aw.Document(html_path).save(docx_path, aw.SaveFormat.DOCX)

    # Step 2: reopen the result, remove the marks and save over the same file.
    cleaned = Document(docx_path)
    remove_mark_from_doc(cleaned)
    cleaned.save(docx_path)

    print("word报告生成成功！")

def html2pdf_pdfkit(html_path, pdf_path):
    """Render an HTML file to PDF through ``pdfkit.from_file``.

    Uses a fixed layout: Letter page size, 0.35in top margin, 0.75in on the
    other three sides, UTF-8 encoding.

    Args:
        html_path: Path of the HTML file to render.
        pdf_path: Path of the PDF file to write.
    """
    page_layout = {
        'page-size': 'Letter',
        'margin-top': '0.35in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
    }
    # Options mapped to None - presumably value-less switches for pdfkit;
    # confirm against the pdfkit documentation.
    switches = dict.fromkeys(('no-outline', 'enable-local-file-access'))

    # Merged in the same key order as a single literal would give.
    render_options = {**page_layout, 'encoding': "UTF-8", **switches}

    pdfkit.from_file(html_path, pdf_path, options=render_options)
    print("pdf报告生成成功！")


def html2excel_pandas(html_path, excel_path):
    """Copy every table read from an HTML file into one Excel workbook.

    Each table returned by ``pd.read_html`` is written to its own worksheet,
    named ``Sheet1``, ``Sheet2``, ... in the order returned, without the
    index column.

    Args:
        html_path: Path of the HTML file to read tables from.
        excel_path: Path of the Excel workbook to write.
    """
    frames = pd.read_html(html_path)

    # The writer is a context manager, so the workbook is finalised on exit.
    with pd.ExcelWriter(excel_path) as workbook:
        for position, frame in enumerate(frames, start=1):
            frame.to_excel(workbook, sheet_name=f'Sheet{position}', index=False)

    print("excel文件生成成功！")


###############################备选方案

def html2docx_pypandoc(html_path, docx_path):
    """Alternative converter: turn an HTML file into DOCX via pypandoc.

    Args:
        html_path: Path of the HTML file to convert.
        docx_path: Path of the DOCX file to write.
    """
    # Imported lazily so the module loads without pypandoc installed.
    from pypandoc import convert_file

    convert_file(html_path, 'docx', outputfile=docx_path)
    print("Word报告生成成功！")


def html2docx_spire(html_path, docx_path):
    """Alternative converter: turn an HTML file into DOCX via Spire.Doc.

    The HTML is loaded with ``XHTMLValidationType.none`` and saved as
    ``FileFormat.Docx2016``.

    Args:
        html_path: Path of the HTML file to convert.
        docx_path: Path of the DOCX file to write.
    """
    # Imported lazily so the module loads without spire.doc installed. This
    # ``Document`` shadows the module-level python-docx one only inside this
    # function.
    from spire.doc import FileFormat, XHTMLValidationType, Document

    document = Document()
    try:
        document.LoadFromFile(html_path, FileFormat.Html, XHTMLValidationType.none)
        document.SaveToFile(docx_path, FileFormat.Docx2016)
    finally:
        # Close even when loading or saving raises, so the document is not
        # left open on the error path.
        document.Close()