diff --git a/input/base_template.html b/input/base_template.html index 4b35afc..7a50637 100644 --- a/input/base_template.html +++ b/input/base_template.html @@ -30,7 +30,7 @@ } nav a { float: left; - display: block; + /* display: block; */ color: white; text-align: center; padding: 14px 20px; diff --git a/input/report_template.html b/input/report_template.html index e56a479..6051280 100644 --- a/input/report_template.html +++ b/input/report_template.html @@ -5,7 +5,7 @@

章节划分

{% for section in sections %}
-

{{ section.title }}

+

{{ section.title }}

{{ section.content }}

{% endfor %} @@ -74,6 +74,7 @@

图片展示

{% for image in images %} +
{{ image.alt }}

{{ image.caption }}

diff --git a/output/generated_report.docx b/output/generated_report.docx index a705894..ede49da 100644 Binary files a/output/generated_report.docx and b/output/generated_report.docx differ diff --git a/output/generated_report.html b/output/generated_report.html index 97cf608..b5b6257 100644 --- a/output/generated_report.html +++ b/output/generated_report.html @@ -30,7 +30,7 @@ } nav a { float: left; - display: block; + /* display: block; */ color: white; text-align: center; padding: 14px 20px; @@ -124,12 +124,12 @@

章节划分

-

章节一

+

章节一

内容一

-

章节二

+

章节二

内容二

@@ -213,12 +213,14 @@

图片展示

+
图片1

这是一张示例图片1

+
图片2

这是一张示例图片2

import aspose.words as aw
import pdfkit
from docx import Document
from docx.oxml.ns import qn

# Text that remove_mark_from_doc() looks for in footer paragraphs.
_ASPOSE_EVALUATION_TEXT = "Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd."


def _remove_images_from_element(element):
    """Remove embedded pictures from every run of *element*'s paragraphs.

    Both ``w:drawing`` and ``w:pict`` children of each run are detached
    from the underlying XML tree.
    """
    for paragraph in element.paragraphs:
        for run in paragraph.runs:
            for tag in ('w:drawing', 'w:pict'):
                # findall() returns a list, so removing while looping is safe.
                for picture in run._element.findall(qn(tag)):
                    picture.getparent().remove(picture)


def _remove_text_from_element(element, text_to_remove=None):
    """Clear every paragraph of *element* that contains *text_to_remove*.

    When *text_to_remove* is None the Aspose evaluation notice is used.
    """
    if text_to_remove is None:
        text_to_remove = _ASPOSE_EVALUATION_TEXT

    for paragraph in element.paragraphs:
        if text_to_remove in paragraph.text:
            # Empty the paragraph's content; the paragraph itself stays.
            paragraph.clear()


def remove_mark_from_doc(doc):
    """Strip the marks left by the Aspose library from a python-docx document.

    For every section, pictures are removed from the header and footer
    paragraphs containing the evaluation notice are cleared. Afterwards the
    first body paragraph is deleted.

    Args:
        doc: An opened ``docx.Document``; it is modified in place.
    """
    for section in doc.sections:
        _remove_images_from_element(section.header)
        _remove_text_from_element(section.footer)

    # Delete the first paragraph of the body.
    # NOTE(review): presumably this is the evaluation banner Aspose puts at
    # the top of the document -- confirm, since it is removed unconditionally.
    if doc.paragraphs:
        first = doc.paragraphs[0]._element
        first.getparent().remove(first)


def html2docx_aspose(html_path, docx_path):
    """Convert an HTML file to DOCX with Aspose.Words, then remove its marks.

    The file at *docx_path* is written by Aspose, reopened with python-docx,
    cleaned by :func:`remove_mark_from_doc` and saved over itself.
    """
    converted = aw.Document(html_path)
    converted.save(docx_path, aw.SaveFormat.DOCX)

    cleaned = Document(docx_path)
    remove_mark_from_doc(cleaned)
    cleaned.save(docx_path)
    print("word报告生成成功!")


def html2pdf_pdfkit(html_path, pdf_path):
    """Convert an HTML file to PDF with pdfkit."""
    # Options whose value is None are passed as value-less switches.
    options = {
        'page-size': 'Letter',
        'margin-top': '0.35in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'no-outline': None,
        'enable-local-file-access': None,
    }

    pdfkit.from_file(html_path, pdf_path, options=options)
    print("pdf报告生成成功!")


# ----------------------------------------------------------------------
# Alternative converters (fallback options)
# ----------------------------------------------------------------------

def html2docx_pypandoc(html_path, docx_path):
    """Convert an HTML file to DOCX with pypandoc."""
    # Imported lazily so pypandoc is only needed when this fallback is used.
    import pypandoc

    pypandoc.convert_file(html_path, 'docx', outputfile=docx_path)
    print("Word报告生成成功!")


def html2docx_spire(html_path, docx_path):
    """Convert an HTML file to DOCX with Spire.Doc."""
    # Imported lazily so Spire.Doc is only needed when this fallback is used.
    from spire.doc import FileFormat, XHTMLValidationType, Document

    document = Document()
    try:
        document.LoadFromFile(html_path, FileFormat.Html, XHTMLValidationType.none)
        document.SaveToFile(docx_path, FileFormat.Docx2016)
    finally:
        # Release the document even when loading or saving fails.
        document.Close()
def main():
    """Render the report template to HTML, then export it to PDF and DOCX."""
    loader = FileSystemLoader(config.template_dir)
    report_template = Environment(loader=loader).get_template("report_template.html")

    # The rendered HTML is the input of both converters below.
    render_html(report_template, config.output_html_path)

    html2pdf_pdfkit(config.output_html_path, config.output_pdf_path)
    html2docx_aspose(config.output_html_path, config.output_docx_path)
from html2docx import html2docx

# Read explicitly as UTF-8: the report contains non-ASCII text, and the
# locale default encoding differs between platforms.
# NOTE(review): assumes the generated HTML is written as UTF-8 -- confirm
# against render_html().
with open("output/generated_report.html", encoding="utf-8") as fp:
    html = fp.read()

# html2docx() returns an io.BytesIO() object. The HTML must be valid.
buf = html2docx(html, title="My Document")

with open("my.docx", "wb") as fp:
    fp.write(buf.getvalue())