如何使用 XPATH 或 Apache POI 从 XML 中过滤水印文本?
Posted
技术标签:
【中文标题】如何使用 XPATH 或 Apache POI 从 XML 中过滤水印文本?【英文标题】:How can I Filter watermark text from XML using XPATH or Apache POI? 【发布时间】:2022-01-01 23:06:35 【问题描述】:这些行按照 XML 打印
private File file; // path to local docx file
// Text extractor obtained from ExtractorFactory for the file above.
private POITextExtractor textExtractor = ExtractorFactory.createExtractor(file);
// NOTE(review): `d` is not declared in this snippet — presumably the XWPFDocument
// opened from `file`; confirm against the full program.
XWPFHeader defaultHeader = d.getHeaderFooterPolicy().getDefaultHeader();
// Takes the first object matched by the path "*" on the header's underlying
// XML bean and converts it to a string (the XML dump shown below).
String raw_xml = defaultHeader._getHdrFtr().selectPath("*")[0].toString()
<?xml version="1.0" encoding="UTF-8"?>
<xml-fragment xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" 
xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
<w:sdtPr>
<w:id w:val="-1126775779" />
<w:docPartObj>
<w:docPartGallery w:val="Watermarks" />
<w:docPartUnique />
</w:docPartObj>
</w:sdtPr>
<w:sdtContent>
<w:p w14:paraId="41319DAD" w14:textId="4534348F" w:rsidR="006868D8" w:rsidRDefault="006868D8">
<w:pPr>
<w:pStyle w:val="Header" />
</w:pPr>
<w:r>
<w:rPr>
<w:noProof />
</w:rPr>
<w:pict w14:anchorId="63C3AA3C">
<v:shapetype id="_x0000_t136" coordsize="21600,21600" o:spt="136" adj="10800" path="m@7,l@8,m@5,21600l@6,21600e">
<v:formulas>
<v:f eqn="sum #0 0 10800" />
<v:f eqn="prod #0 2 1" />
<v:f eqn="sum 21600 0 @1" />
<v:f eqn="sum 0 0 @2" />
<v:f eqn="sum 21600 0 @3" />
<v:f eqn="if @0 @3 0" />
<v:f eqn="if @0 21600 @1" />
<v:f eqn="if @0 0 @2" />
<v:f eqn="if @0 @4 21600" />
<v:f eqn="mid @5 @6" />
<v:f eqn="mid @8 @5" />
<v:f eqn="mid @7 @8" />
<v:f eqn="mid @6 @7" />
<v:f eqn="sum @6 0 @5" />
</v:formulas>
<v:path textpathok="t" o:connecttype="custom" o:connectlocs="@9,0;@10,10800;@11,21600;@12,10800" o:connectangles="270,180,90,0" />
<v:textpath on="t" fitshape="t" />
<v:handles>
<v:h position="#0,bottomRight" xrange="6629,14971" />
</v:handles>
<o:lock v:ext="edit" text="t" shapetype="t" />
</v:shapetype>
<v:shape id="PowerPlusWaterMarkObject357476642" o:spid="_x0000_s1025" type="#_x0000_t136" style="position:absolute;margin-left:0;margin-top:0;width:527.85pt;height:131.95pt;rotation:315;z-index:-251657216;mso-position-horizontal:center;mso-position-horizontal-relative:margin;mso-position-vertical:center;mso-position-vertical-relative:margin" o:allowincell="f" fillcolor="silver" stroked="f">
<v:fill opacity=".5" />
<v:textpath style="font-family:&quot;Calibri&quot;;font-size:1pt" string="CONFIDENTIAL" />
<w10:wrap anchorx="margin" anchory="margin" />
</v:shape>
</w:pict>
</w:r>
</w:p>
</w:sdtContent>
</xml-fragment>
以下 XPATH 显示 Confidential
string(//v:shape[contains(@id,'PowerPlusWaterMarkObject')]/v:textpath/@string)
我该如何使用这个 XPATH 来获取水印(Watermark)的确切值?或者,是否有其他方式可以在 Apache POI 中获取水印文本?
【问题讨论】:
【参考方案1】:您已经找到了 org.apache.xmlbeans.XmlObject.selectPath,它允许通过 XPATH 选择 XmlObject。问题在于,可使用的 XPATH 的复杂程度受限于 JRE 能够使用的 XPATH 求值器类型。
对我来说(Windows 10、JRE 12.0.2),需要把 Saxon-HE-10.6.jar 放在类路径中才能启用谓词过滤。否则,路径 $this//v:shape[@id] 会导致找不到类的异常:java.lang.ClassNotFoundException: net.sf.saxon.sxpath.XPathStaticContext。
完整示例:
import java.io.FileInputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import java.util.StringJoiner;
public class ReadWordWatermarkXWPFXPATH
static String getWatermarkText(XWPFDocument document)
StringJoiner stringJoiner = new StringJoiner(" ");
for (XWPFHeader header : document.getHeaderList())
org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr ctHdrFtr = header._getHdrFtr();
String declareNameSpaces = "declare namespace v='urn:schemas-microsoft-com:vml'; ";
org.apache.xmlbeans.XmlObject[] selectedObjects = ctHdrFtr.selectPath(
declareNameSpaces
+ "$this//v:shape[contains(@id,'PowerPlusWaterMarkObject')]/v:textpath/@string");
for (org.apache.xmlbeans.XmlObject object : selectedObjects)
if (object instanceof org.apache.xmlbeans.XmlString)
org.apache.xmlbeans.XmlString xmlString = (org.apache.xmlbeans.XmlString)object;
stringJoiner.add(xmlString.getStringValue());
return stringJoiner.toString();
public static void main(String[] args) throws Exception
XWPFDocument document = new XWPFDocument(new FileInputStream("./WordDocument.docx"));
String watermarkText = getWatermarkText(document);
System.out.println(watermarkText);
像 $this//v:shape 这样的简单路径,无需在类路径中额外添加约 5 MB 的 Saxon-HE-10.6.jar 即可正常工作。
知道了这一点,我们可以这样做:
import java.io.FileInputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import java.util.StringJoiner;
public class ReadWordWatermarkXWPF
static String getWatermarkText(XWPFDocument document)
StringJoiner stringJoiner = new StringJoiner(" ");
for (XWPFHeader header : document.getHeaderList())
org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr ctHdrFtr = header._getHdrFtr();
String declareNameSpaces = "declare namespace v='urn:schemas-microsoft-com:vml'; ";
org.apache.xmlbeans.XmlObject[] selectedObjects = ctHdrFtr.selectPath(
declareNameSpaces
+ "$this//v:shape");
for (org.apache.xmlbeans.XmlObject object : selectedObjects)
if (object instanceof com.microsoft.schemas.vml.CTShape)
com.microsoft.schemas.vml.CTShape shape = (com.microsoft.schemas.vml.CTShape)object;
if (shape.getId() != null)
String id = shape.getId();
if (id.contains("PowerPlusWaterMarkObject"))
for (com.microsoft.schemas.vml.CTTextPath textPath : shape.getTextpathList())
stringJoiner.add(textPath.getString());
return stringJoiner.toString();
public static void main(String[] args) throws Exception
XWPFDocument document = new XWPFDocument(new FileInputStream("./WordDocument.docx"));
String watermarkText = getWatermarkText(document);
System.out.println(watermarkText);
【讨论】:
【参考方案2】:使用 XPATH 解决了我的问题
*//v:shape/v:textpath/@string
【讨论】:
以上是关于如何使用 XPATH 或 Apache POI 从 XML 中过滤水印文本?的主要内容,如果未能解决你的问题,请参考以下文章
Apache Poi Word表,有关Alt Text的信息