aspose-words生成文档，与html格式对应

2024-07-31 10:16:39 +08:00 · 2024-07-31 10:16:39 +08:00 · 52a6d03899
parent 95c7522a1e
commit 52a6d03899
10 changed files with 129 additions and 37 deletions
--- a/input/base_template.html
+++ b/input/base_template.html
@ -30,7 +30,7 @@
        }
        nav a {
            float: left;
-            display: block;
+            /* display: block; */
            color: white;
            text-align: center;
            padding: 14px 20px;
--- a/input/report_template.html
+++ b/input/report_template.html
@ -5,7 +5,7 @@
        <h2>章节划分</h2>
        {% for section in sections %}
            <div class="section">
-                <h2>{{ section.title }}</h2>
+                <h3>{{ section.title }}</h3>
                <p>{{ section.content }}</p>
            </div>
        {% endfor %}
@ -74,6 +74,7 @@
        <h2>图片展示</h2>
        {% for image in images %}
            <!-- <div class="image-container"> -->
+                <!-- <img src="{{ image.src }}" alt="{{ image.alt }}"> -->
            <div style="text-align: center; margin-bottom: 10px;">
                <img src="{{ image.src }}" alt="{{ image.alt }}" style="width: 200px; height: auto;">
                <p>{{ image.caption }}</p>
--- a/output/generated_report.docx
+++ b/output/generated_report.docx
--- a/output/generated_report.html
+++ b/output/generated_report.html
@ -30,7 +30,7 @@
        }
        nav a {
            float: left;
-            display: block;
+            /* display: block; */
            color: white;
            text-align: center;
            padding: 14px 20px;
@ -124,12 +124,12 @@
        <h2>章节划分</h2>
        
            <div class="section">
-                <h2>章节一</h2>
+                <h3>章节一</h3>
                <p>内容一</p>
            </div>
        
            <div class="section">
-                <h2>章节二</h2>
+                <h3>章节二</h3>
                <p>内容二</p>
            </div>
        
@ -213,12 +213,14 @@
        <h2>图片展示</h2>
        
            <!-- <div class="image-container"> -->
+                <!-- <img src="/home/dengjinlai/ReportGeneration/ReportGeneration/input/image.png" alt="图片1"> -->
            <div style="text-align: center; margin-bottom: 10px;">
                <img src="/home/dengjinlai/ReportGeneration/ReportGeneration/input/image.png" alt="图片1" style="width: 200px; height: auto;">
                <p>这是一张示例图片1</p>
            </div>
        
            <!-- <div class="image-container"> -->
+                <!-- <img src="/home/dengjinlai/ReportGeneration/ReportGeneration/input/image.png" alt="图片2"> -->
            <div style="text-align: center; margin-bottom: 10px;">
                <img src="/home/dengjinlai/ReportGeneration/ReportGeneration/input/image.png" alt="图片2" style="width: 200px; height: auto;">
                <p>这是一张示例图片2</p>
--- a/output/generated_report.pdf
+++ b/output/generated_report.pdf
--- a/process/inference.py
+++ b/process/inference.py
@ -0,0 +1,93 @@
+import aspose.words as aw
+import pdfkit
+from docx import Document
+from docx.oxml.ns import qn
+# from docx.oxml import OxmlElement
+
+
+def remove_mark_from_doc(doc):
+    # 移除aspose库的标记
+    def remove_images_from_element(element):
+        #移除嵌入的图片
+        for paragraph in element.paragraphs:
+            for run in paragraph.runs:
+                drawing_elements = run._element.findall(qn('w:drawing'))
+                for drawing in drawing_elements:
+                    drawing.getparent().remove(drawing)
+                    
+                pict_elements = run._element.findall(qn('w:pict'))
+                for pict in pict_elements:
+                    pict.getparent().remove(pict)
+
+    def remove_text_from_element(element, text_to_remove=None):
+        # 移除aspose的标记（此处为页脚处的）
+        if text_to_remove == None:
+            text_to_remove = "Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd."
+
+        for paragraph in element.paragraphs:
+            if text_to_remove in paragraph.text:
+                # 清除特定文本
+                paragraph.clear()
+
+    for section in doc.sections:
+        header = section.header
+        footer = section.footer
+
+        remove_images_from_element(header)
+        remove_text_from_element(footer)
+
+    #删除第一段落
+    if doc.paragraphs:
+        first_paragraph = doc.paragraphs[0]
+        p = first_paragraph._element
+        p.getparent().remove(p)
+
+
+def html2docx_aspose(html_path, docx_path):
+    doc = aw.Document(html_path)
+    doc.save(docx_path,aw.SaveFormat.DOCX)
+
+    doc = Document(docx_path)
+    remove_mark_from_doc(doc)
+    doc.save(docx_path)
+    print("word报告生成成功！")
+
+def html2pdf_pdfkit(html_path, pdf_path):
+    # 将HTML文件转换为PDF
+    options = {
+        'page-size': 'Letter',
+        'margin-top': '0.35in',
+        'margin-right': '0.75in',
+        'margin-bottom': '0.75in',
+        'margin-left': '0.75in',
+        'encoding': "UTF-8",
+        'no-outline': None,
+        'enable-local-file-access': None
+    }
+    
+    pdfkit.from_file(html_path, pdf_path, options=options)
+    print("pdf报告生成成功！")
+
+
+
+
+
+
+
+
+
+###############################备选方案
+
+def html2docx_pypandoc(html_path, docx_path):
+    import pypandoc
+    pypandoc.convert_file(html_path, 'docx', outputfile=docx_path)
+    print("Word报告生成成功！")
+
+
+def html2docx_spire(html_path, docx_path):
+    from spire.doc import FileFormat,XHTMLValidationType,Document
+    # from spire.doc.common import *
+    document = Document()
+    document.LoadFromFile(html_path, FileFormat.Html, XHTMLValidationType.none)
+    document.SaveToFile(docx_path, FileFormat.Docx2016)
+    document.Close()
--- a/report_generation.py
+++ b/report_generation.py
@ -1,7 +1,6 @@
 from jinja2 import Environment, FileSystemLoader
-import pdfkit
 import config
-import pypandoc
+from process.inference import html2docx_aspose, html2pdf_pdfkit


 def render_html(template, output_file_path):
@ -46,42 +45,15 @@ def render_html(template, output_file_path):
        f.write(report_html)


-def html2pdf(html_path, pdf_path):
-    # 将HTML文件转换为PDF
-    options = {
-        'page-size': 'Letter',
-        'margin-top': '0.35in',
-        'margin-right': '0.75in',
-        'margin-bottom': '0.75in',
-        'margin-left': '0.75in',
-        'encoding': "UTF-8",
-        'no-outline': None,
-        'enable-local-file-access': None
-    }

-    pdfkit.from_file(html_path, pdf_path, options=options)
-    print("pdf报告生成成功！")
-
-def html2docx(html_path, docx_path):
-    # 将HTML文件转换为WORD
-    pypandoc.convert_file(html_path, 'docx', outputfile=docx_path)
-    print("Word报告生成成功！")
-
-def html2docx_spire(html_path, docx_path):
-    from spire.doc import FileFormat,XHTMLValidationType,Document
-    # from spire.doc.common import *
-    document = Document()
-    document.LoadFromFile(html_path, FileFormat.Html, XHTMLValidationType.none)
-    document.SaveToFile(docx_path, FileFormat.Docx2016)
-    document.Close()

 def main():
    env = Environment(loader=FileSystemLoader(config.template_dir))
    template = env.get_template("report_template.html")

    render_html(template, config.output_html_path)
-    html2pdf(config.output_html_path, config.output_pdf_path)
-    html2docx(config.output_html_path, config.output_docx_path)
+    html2pdf_pdfkit(config.output_html_path, config.output_pdf_path)
+    html2docx_aspose(config.output_html_path, config.output_docx_path)

          
 if __name__ == "__main__":
--- a/requirements.txt
+++ b/requirements.txt
@ -5,5 +5,17 @@ pandas==2.2.2
 opencv-python==4.10.0.84
 openpyxl==3.1.4
 Jinja2==3.1.4
+aspose-words==24.7.0
+
+
+
+
+
+

 # Spire.Doc==12.7.1
+
+# html2docx==1.6.0
+
+# sudo dpkg -i libssl1.0.0_1.0.2g-1ubuntu4.20_amd64.deb
+# wget http://security.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.0.0_1.0.2g-1ubuntu4.20_amd64.deb
--- a/test_files/temp.html2docx.py
+++ b/test_files/temp.html2docx.py
@ -0,0 +1,12 @@
+from html2docx import html2docx
+
+with open("output/generated_report.html") as fp:
+    html = fp.read()
+
+# html2docx() returns an io.BytesIO() object. The HTML must be valid.
+buf = html2docx(html, title="My Document")
+
+with open("my.docx", "wb") as fp:
+    fp.write(buf.getvalue())
+
+
--- a/tools/libssl1.0.0_1.0.2g-1ubuntu4.20_amd64.deb
+++ b/tools/libssl1.0.0_1.0.2g-1ubuntu4.20_amd64.deb