impl header reader with rust (#376)

* fix: document_rels reader to keep multi rels

* feat: Add header reader

* feat: assign header to sectionProperty

* spec: update snaps

* feat: read titlePg

* spec: update snaps
main
bokuweb 2021-11-30 02:36:04 +09:00 committed by GitHub
parent 3342269a45
commit 423e075eb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 391 additions and 92 deletions

View File

@ -88,7 +88,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: 1.45.1
toolchain: 1.56.1
override: true
- run: rustup component add clippy
- uses: actions-rs/cargo@v1

View File

@ -4,7 +4,7 @@ use std::fs::File;
use std::io::{Read, Write};
pub fn main() {
let mut file = File::open("./spacing.docx").unwrap();
let mut file = File::open("./header.docx").unwrap();
let mut buf = vec![];
file.read_to_end(&mut buf).unwrap();

View File

@ -97,8 +97,10 @@ impl Delete {
impl HistoryId for Delete {}
impl BuildXML for Delete {
#[allow(clippy::needless_borrow)]
fn build(&self) -> Vec<u8> {
let mut b = XMLBuilder::new().open_delete(&self.generate(), &self.author, &self.date);
let id = self.generate();
let mut b = XMLBuilder::new().open_delete(&id, &self.author, &self.date);
for c in &self.children {
match c {
DeleteChild::Run(t) => b = b.add_child(t),

View File

@ -6,8 +6,8 @@ use serde::Serialize;
#[derive(Debug, Clone, PartialEq, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct HeaderReference {
header_type: String,
id: String,
pub header_type: String,
pub id: String,
}
impl Default for HeaderReference {

View File

@ -113,9 +113,10 @@ impl Insert {
}
pub fn add_comment_start(mut self, comment: Comment) -> Self {
self.children.push(InsertChild::CommentStart(Box::new(
CommentRangeStart::new(comment),
)));
self.children
.push(InsertChild::CommentStart(Box::new(CommentRangeStart::new(
comment,
))));
self
}
@ -139,6 +140,7 @@ impl Insert {
impl HistoryId for Insert {}
impl BuildXML for Insert {
#[allow(clippy::needless_borrow)]
fn build(&self) -> Vec<u8> {
XMLBuilder::new()
.open_insert(&self.generate(), &self.author, &self.date)

View File

@ -1,4 +1,5 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::io::{Cursor, Read};
use std::path::*;
use std::str::FromStr;
@ -8,14 +9,18 @@ use xml::reader::{EventReader, XmlEvent};
use super::errors::*;
use super::*;
pub type RId = String;
#[derive(Debug, Clone, PartialEq)]
pub struct ReadDocumentRels {
rels: BTreeMap<String, PathBuf>,
rels: BTreeMap<String, HashSet<(RId, PathBuf)>>,
}
impl ReadDocumentRels {
pub fn find_target_path(&self, target: &str) -> Option<PathBuf> {
self.rels.get(target).cloned()
pub fn find_target_path(&self, target: &str) -> Option<Vec<(RId, PathBuf)>> {
self.rels
.get(target)
.map(|s| s.clone().into_iter().collect())
}
}
@ -51,6 +56,7 @@ fn read_rels_xml<R: Read>(
let e = XMLElement::from_str(&name.local_name).unwrap();
if let XMLElement::Relationship = e {
let mut rel_type = "".to_owned();
let mut rid = "".to_owned();
let mut target = PathBuf::default();
for a in attributes {
let local_name = &a.name.local_name;
@ -58,9 +64,18 @@ fn read_rels_xml<R: Read>(
rel_type = a.value.to_owned();
} else if local_name == "Target" {
target = Path::new(dir.as_ref()).join(a.value);
} else if local_name == "Id" {
rid = a.value.to_owned();
}
}
rels.rels.insert(rel_type, target);
let current = rels.rels.remove(&rel_type);
if let Some(mut paths) = current {
paths.insert((rid, target));
rels.rels.insert(rel_type, paths);
} else {
let s: HashSet<(RId, PathBuf)> = vec![(rid, target)].into_iter().collect();
rels.rels.insert(rel_type, s);
}
continue;
}
}

View File

@ -0,0 +1,71 @@
use std::io::Read;
use std::str::FromStr;
use crate::reader::*;
use xml::reader::{EventReader, XmlEvent};
use super::{Paragraph, Table};
impl FromXML for Header {
    /// Parses a header part from `reader` into a `Header`.
    ///
    /// Every `Paragraph` and `Table` start element encountered in the event
    /// stream is handed to its own element reader and appended to the header
    /// in document order; all other elements are ignored.
    ///
    /// # Errors
    ///
    /// Returns `ReaderError::XMLReadError` when the underlying XML parser
    /// reports an error, and propagates any error produced while reading a
    /// paragraph or a table.
    fn from_xml<R: Read>(reader: R) -> Result<Self, ReaderError> {
        let mut parser = EventReader::new(reader);
        let mut header = Self::default();

        loop {
            match parser.next() {
                Ok(XmlEvent::StartElement {
                    attributes, name, ..
                }) => match XMLElement::from_str(&name.local_name).unwrap() {
                    XMLElement::Paragraph => {
                        let paragraph = Paragraph::read(&mut parser, &attributes)?;
                        header = header.add_paragraph(paragraph);
                    }
                    XMLElement::Table => {
                        let table = Table::read(&mut parser, &attributes)?;
                        header = header.add_table(table);
                    }
                    // Anything else at this level is not part of the header body.
                    _ => {}
                },
                // End of the XML stream: the header is complete.
                Ok(XmlEvent::EndDocument) => return Ok(header),
                Err(_) => return Err(ReaderError::XMLReadError),
                Ok(_) => {}
            }
        }
    }
}
// Checks that `Header::from_xml` turns a `w:hdr` document holding a single
// paragraph with a single text run into the equivalent builder-made `Header`.
#[test]
fn test_header_from_xml() {
// Fixture: a header part with one `w:p` whose only run carries "Hello Header".
let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:hdr xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:w10="urn:schemas-microsoft-com:office:word"
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup"
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing"
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" mc:Ignorable="w14 wp14">
<w:p w14:paraId="12345678">
<w:pPr>
<w:rPr />
</w:pPr>
<w:r>
<w:rPr />
<w:t xml:space="preserve">Hello Header</w:t>
</w:r>
</w:p>
</w:hdr>"#;
// Parse the fixture straight from its bytes.
let h = Header::from_xml(xml.as_bytes()).unwrap();
// The same header assembled through the builder API.
let expected =
Header::new().add_paragraph(Paragraph::new().add_run(Run::new().add_text("Hello Header")));
assert_eq!(h, expected)
}

View File

@ -17,6 +17,7 @@ mod document_rels;
mod drawing;
mod errors;
mod from_xml;
mod header;
mod ignore;
mod insert;
mod level;
@ -50,7 +51,7 @@ mod wps_shape;
mod wps_text_box;
mod xml_element;
use std::io::Cursor;
use std::{collections::HashMap, io::Cursor};
use crate::documents::*;
@ -61,6 +62,7 @@ pub use from_xml::*;
pub use mc_fallback::*;
pub use read_zip::*;
pub use xml_element::*;
use zip::ZipArchive;
// 2006
const DOC_RELATIONSHIP_TYPE: &str =
@ -77,10 +79,36 @@ const COMMENTS_TYPE: &str =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments";
const WEB_SETTINGS_TYPE: &str =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings";
const HEADER_TYPE: &str =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/header";
// 2011
const COMMENTS_EXTENDED_TYPE: &str =
"http://schemas.microsoft.com/office/2011/relationships/commentsExtended";
/// Loads every header part registered under `HEADER_TYPE` in the document
/// relationships and returns them keyed by relationship id.
///
/// Loading is best-effort: a header whose archive entry cannot be read, or
/// whose XML cannot be parsed, is left out of the result instead of failing
/// the whole read. When no header relationship exists the map is empty.
///
/// # Panics
///
/// Panics if a header target path is not valid UTF-8.
fn read_headers(
    rels: &ReadDocumentRels,
    archive: &mut ZipArchive<Cursor<&[u8]>>,
) -> HashMap<RId, Header> {
    rels.find_target_path(HEADER_TYPE)
        .unwrap_or_default()
        .into_iter()
        .filter_map(|(rid, path)| {
            let entry_name = path.to_str().expect("should have header path.");
            // Unreadable or unparsable parts are skipped silently.
            let bytes = read_zip(archive, entry_name).ok()?;
            let header = Header::from_xml(&bytes[..]).ok()?;
            Some((rid, header))
        })
        .collect()
}
pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
let mut docx = Docx::new();
let cur = Cursor::new(buf);
@ -121,9 +149,12 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
let rels = read_document_rels(&mut archive, &document_path)?;
let headers = read_headers(&rels, &mut archive);
// Read commentsExtended
let comments_extended_path = rels.find_target_path(COMMENTS_EXTENDED_TYPE);
let comments_extended = if let Some(comments_extended_path) = comments_extended_path {
if let Some((_, comments_extended_path)) = comments_extended_path.get(0) {
let data = read_zip(
&mut archive,
comments_extended_path
@ -137,11 +168,15 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
}
} else {
CommentsExtended::default()
}
} else {
CommentsExtended::default()
};
// Read comments
let comments_path = rels.find_target_path(COMMENTS_TYPE);
let comments = if let Some(comments_path) = comments_path {
let comments = if let Some(paths) = comments_path {
if let Some((_, comments_path)) = paths.get(0) {
let data = read_zip(
&mut archive,
comments_path.to_str().expect("should have comments."),
@ -185,6 +220,9 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
}
} else {
Comments::default()
}
} else {
Comments::default()
};
let document = {
@ -193,6 +231,28 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
};
docx = docx.document(document);
// assign headers
if let Some(h) = docx.document.section_property.header_reference.clone() {
if let Some(header) = headers.get(&h.id) {
docx.document = docx.document.header(header.clone(), &h.id);
}
}
if let Some(ref h) = docx
.document
.section_property
.first_header_reference
.clone()
{
if let Some(header) = headers.get(&h.id) {
docx.document = docx.document.first_header(header.clone(), &h.id);
}
}
if let Some(ref h) = docx.document.section_property.even_header_reference.clone() {
if let Some(header) = headers.get(&h.id) {
docx.document = docx.document.even_header(header.clone(), &h.id);
}
}
// store comments to paragraphs.
if !comments.inner().is_empty() {
docx.store_comments(comments.inner());
@ -203,7 +263,8 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
// Read document relationships
// Read styles
let style_path = rels.find_target_path(STYLE_RELATIONSHIP_TYPE);
if let Some(style_path) = style_path {
if let Some(paths) = style_path {
if let Some((_, style_path)) = paths.get(0) {
let data = read_zip(
&mut archive,
style_path.to_str().expect("should have styles"),
@ -211,10 +272,12 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
let styles = Styles::from_xml(&data[..])?;
docx = docx.styles(styles);
}
}
// Read numberings
let num_path = rels.find_target_path(NUMBERING_RELATIONSHIP_TYPE);
if let Some(num_path) = num_path {
if let Some(paths) = num_path {
if let Some((_, num_path)) = paths.get(0) {
let data = read_zip(
&mut archive,
num_path.to_str().expect("should have numberings"),
@ -222,10 +285,12 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
let nums = Numberings::from_xml(&data[..])?;
docx = docx.numberings(nums);
}
}
// Read settings
let settings_path = rels.find_target_path(SETTINGS_TYPE);
if let Some(settings_path) = settings_path {
if let Some(paths) = settings_path {
if let Some((_, settings_path)) = paths.get(0) {
let data = read_zip(
&mut archive,
settings_path.to_str().expect("should have settings"),
@ -233,11 +298,12 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
let settings = Settings::from_xml(&data[..])?;
docx = docx.settings(settings);
}
}
// Read web settings
let web_settings_path = rels.find_target_path(WEB_SETTINGS_TYPE);
dbg!(&web_settings_path);
if let Some(web_settings_path) = web_settings_path {
if let Some(paths) = web_settings_path {
if let Some((_, web_settings_path)) = paths.get(0) {
let data = read_zip(
&mut archive,
web_settings_path
@ -247,6 +313,7 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
let web_settings = WebSettings::from_xml(&data[..])?;
docx = docx.web_settings(web_settings);
}
}
Ok(docx)
}

View File

@ -52,13 +52,14 @@ mod tests {
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml" />
</Relationships>"#;
let c = Rels::from_xml(xml.as_bytes()).unwrap();
let mut rels = Vec::new();
rels.push((
let rels =
vec![
(
"http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"
.to_owned(),
"rId1".to_owned(),
"docProps/core.xml".to_owned(),
));
)];
assert_eq!(Rels { rels }, c);
}
}

View File

@ -57,6 +57,24 @@ fn read_page_margin(
Ok(margin)
}
/// Extracts the relationship id and header type from the attributes of a
/// header reference element.
///
/// Returns `(rid, header_type)`. The id is the empty string when no `id`
/// attribute is present, and the type falls back to `"default"` when no
/// `type` attribute is present. If an attribute occurs more than once, the
/// last occurrence wins. The current implementation never returns `Err`.
fn read_header_reference(attributes: &[OwnedAttribute]) -> Result<(String, String), ReaderError> {
    let mut header_type = String::from("default");
    let mut rid = String::new();

    for attr in attributes {
        // Only the local name is compared, so any namespace prefix is ignored.
        let key = attr.name.local_name.as_str();
        if key == "type" {
            header_type = attr.value.to_owned();
        } else if key == "id" {
            rid = attr.value.to_owned();
        }
    }

    Ok((rid, header_type))
}
impl ElementReader for SectionProperty {
fn read<R: Read>(
r: &mut EventReader<R>,
@ -84,6 +102,26 @@ impl ElementReader for SectionProperty {
sp = sp.doc_grid(doc_grid);
}
}
XMLElement::HeaderReference => {
if let Ok((rid, header_type)) = read_header_reference(&attributes) {
match header_type.as_str() {
"default" => {
sp.header_reference =
Some(HeaderReference::new(header_type, rid));
}
"first" => {
sp.first_header_reference =
Some(HeaderReference::new(header_type, rid));
}
"even" => {
sp.even_header_reference =
Some(HeaderReference::new(header_type, rid));
}
_ => {}
}
}
}
XMLElement::TitlePg => sp = sp.title_pg(),
_ => {}
}
}

View File

@ -139,6 +139,9 @@ pub enum XMLElement {
PageSize,
PageMargin,
WebSettings,
HeaderReference,
TitlePg,
EvenAndOddHeaders,
Unsupported,
}
@ -335,6 +338,9 @@ impl FromStr for XMLElement {
"keepLines" => Ok(XMLElement::KeepLines),
"pageBreakBefore" => Ok(XMLElement::PageBreakBefore),
"windowControl" => Ok(XMLElement::WindowControl),
"headerReference" => Ok(XMLElement::HeaderReference),
"titlePg" => Ok(XMLElement::TitlePg),
"evenAndOddHeaders" => Ok(XMLElement::EvenAndOddHeaders),
_ => Ok(XMLElement::Unsupported),
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1695,6 +1695,14 @@ Object {
"gridType": "lines",
"linePitch": 360,
},
"header": Object {
"children": Array [],
"hasNumbering": false,
},
"headerReference": Object {
"headerType": "default",
"id": "rId4",
},
"pageMargin": Object {
"bottom": 1701,
"footer": 992,
@ -3815,6 +3823,14 @@ Object {
"gridType": "lines",
"linePitch": 360,
},
"header": Object {
"children": Array [],
"hasNumbering": false,
},
"headerReference": Object {
"headerType": "default",
"id": "rId4",
},
"pageMargin": Object {
"bottom": 1701,
"footer": 992,
@ -7225,6 +7241,14 @@ Object {
"gridType": "lines",
"linePitch": 360,
},
"header": Object {
"children": Array [],
"hasNumbering": false,
},
"headerReference": Object {
"headerType": "default",
"id": "rId4",
},
"pageMargin": Object {
"bottom": 1701,
"footer": 992,
@ -12491,6 +12515,79 @@ Object {
"gridType": "lines",
"linePitch": 360,
},
"header": Object {
"children": Array [
Object {
"data": Object {
"children": Array [
Object {
"data": Object {
"children": Array [],
"runProperty": Object {
"bold": null,
"boldCs": null,
"characterSpacing": null,
"color": null,
"del": null,
"fonts": null,
"highlight": null,
"ins": null,
"italic": null,
"italicCs": null,
"sz": null,
"szCs": null,
"textBorder": null,
"underline": null,
"vanish": null,
"vertAlign": null,
},
},
"type": "run",
},
],
"hasNumbering": false,
"id": "0C07C25B",
"property": Object {
"alignment": null,
"divId": null,
"indent": null,
"keepLines": false,
"keepNext": false,
"lineSpacing": null,
"numberingProperty": null,
"outlineLvl": null,
"pageBreakBefore": false,
"runProperty": Object {
"bold": null,
"boldCs": null,
"characterSpacing": null,
"color": null,
"del": null,
"fonts": null,
"highlight": null,
"ins": null,
"italic": null,
"italicCs": null,
"sz": null,
"szCs": null,
"textBorder": null,
"underline": null,
"vanish": null,
"vertAlign": null,
},
"style": null,
"windowControl": false,
},
},
"type": "paragraph",
},
],
"hasNumbering": false,
},
"headerReference": Object {
"headerType": "default",
"id": "rId11",
},
"pageMargin": Object {
"bottom": 1701,
"footer": 992,