From 8224ff066337f7ddfa11a0f9d264171778b680f6 Mon Sep 17 00:00:00 2001 From: bokuweb Date: Sat, 10 Feb 2024 10:59:35 +0900 Subject: [PATCH] Read header and footer image (#679) * feat: read header images * feat: read footer image * update snaps --- docx-core/src/reader/document_rels.rs | 87 +------------------ docx-core/src/reader/errors.rs | 2 + docx-core/src/reader/header_or_footer_rels.rs | 43 +++++++++ docx-core/src/reader/mod.rs | 77 +++++++++++----- docx-core/src/reader/rels.rs | 87 ++++++++++++++++++- 5 files changed, 190 insertions(+), 106 deletions(-) create mode 100644 docx-core/src/reader/header_or_footer_rels.rs diff --git a/docx-core/src/reader/document_rels.rs b/docx-core/src/reader/document_rels.rs index 50cd156..2df9726 100644 --- a/docx-core/src/reader/document_rels.rs +++ b/docx-core/src/reader/document_rels.rs @@ -1,10 +1,10 @@ use std::collections::BTreeMap; use std::collections::HashSet; -use std::io::{Cursor, Read}; +use std::io::Cursor; use std::path::*; -use std::str::FromStr; -use xml::reader::{EventReader, XmlEvent}; +use document_rels::rels::find_rels_filename; +use document_rels::rels::read_rels_xml; use super::errors::*; use super::*; @@ -36,84 +36,5 @@ pub fn read_document_rels( let p = p.to_str().ok_or(ReaderError::DocumentRelsNotFoundError)?; let data = read_zip(archive, p)?; let rels = read_rels_xml(&data[..], dir)?; - Ok(rels) -} - -fn read_rels_xml( - reader: R, - dir: impl AsRef, -) -> Result { - let mut parser = EventReader::new(reader); - let mut rels = ReadDocumentRels { - rels: BTreeMap::new(), - }; - loop { - let e = parser.next(); - match e { - Ok(XmlEvent::StartElement { - attributes, name, .. - }) => { - let e = XMLElement::from_str(&name.local_name).unwrap(); - if let XMLElement::Relationship = e { - let mut rel_type = "".to_owned(); - let mut rid = "".to_owned(); - let mut target_mode = None; - let mut target_string = "".to_owned(); - for a in attributes { - let local_name = &a.name.local_name; - if local_name == "Type" { - rel_type = a.value.to_owned(); - } else if local_name == "Target" { - // target_str = Path::new(dir.as_ref()).join(a.value); - target_string = a.value.to_owned(); - } else if local_name == "Id" { - rid = a.value.to_owned(); - } else if local_name == "TargetMode" { - target_mode = Some(a.value.to_owned()); - } - } - - let target = if !rel_type.ends_with("hyperlink") { - Path::new(dir.as_ref()).join(target_string) - } else { - Path::new("").join(target_string) - }; - - let current = rels.rels.remove(&rel_type); - if let Some(mut paths) = current { - paths.insert((rid, target, target_mode)); - rels.rels.insert(rel_type, paths); - } else { - let s: HashSet<(RId, PathBuf, Option)> = - vec![(rid, target, target_mode)].into_iter().collect(); - rels.rels.insert(rel_type, s); - } - continue; - } - } - Ok(XmlEvent::EndElement { name, .. }) => { - let e = XMLElement::from_str(&name.local_name).unwrap(); - if let XMLElement::Relationships = e { - break; - } - } - Err(_) => return Err(ReaderError::XMLReadError), - _ => {} - } - } - Ok(rels) -} - -fn find_rels_filename(main_path: impl AsRef) -> Result { - let path = main_path.as_ref(); - let dir = path - .parent() - .ok_or(ReaderError::DocumentRelsNotFoundError)?; - let base = path - .file_stem() - .ok_or(ReaderError::DocumentRelsNotFoundError)?; - Ok(Path::new(dir) - .join("_rels") - .join(base) - .with_extension("xml.rels")) + Ok(ReadDocumentRels { rels }) } diff --git a/docx-core/src/reader/errors.rs b/docx-core/src/reader/errors.rs index 141ed31..e265742 100644 --- a/docx-core/src/reader/errors.rs +++ b/docx-core/src/reader/errors.rs @@ -20,6 +20,8 @@ pub enum ReaderError { DocumentStylesNotFoundError, #[error("Failed to find numberings.")] DocumentNumberingsNotFoundError, + #[error("Failed to find header or footer rels.")] + HeaderOrFooterRelsNotFoundError, #[error("Unknown error")] Unknown, } diff --git a/docx-core/src/reader/header_or_footer_rels.rs b/docx-core/src/reader/header_or_footer_rels.rs new file mode 100644 index 0000000..4bc1209 --- /dev/null +++ b/docx-core/src/reader/header_or_footer_rels.rs @@ -0,0 +1,43 @@ +use std::collections::BTreeMap; +use std::collections::HashSet; +use std::io::Cursor; +use std::path::*; + +use header_or_footer_rels::rels::find_rels_filename; + +use self::rels::read_rels_xml; + +use super::errors::*; +use super::*; + +pub type RId = String; + +#[derive(Debug, Clone, PartialEq, Default)] +pub struct ReadHeaderOrFooterRels { + pub rels: BTreeMap)>>, +} + +impl ReadHeaderOrFooterRels { + pub fn find_target_path(&self, target: &str) -> Option)>> { + self.rels + .get(target) + .map(|s| s.clone().into_iter().collect()) + } +} + +pub fn read_header_or_footer_rels( + archive: &mut zip::read::ZipArchive>, + header_or_footer_path: impl AsRef, +) -> Result { + let dir = &header_or_footer_path + .as_ref() + .parent() + .ok_or(ReaderError::HeaderOrFooterRelsNotFoundError)?; + let p = find_rels_filename(&header_or_footer_path)?; + let p = p + .to_str() + .ok_or(ReaderError::HeaderOrFooterRelsNotFoundError)?; + let data = read_zip(archive, p)?; + let rels = read_rels_xml(&data[..], dir)?; + Ok(ReadHeaderOrFooterRels { rels }) +} diff --git a/docx-core/src/reader/mod.rs b/docx-core/src/reader/mod.rs index f18faf5..27fc4ec 100644 --- a/docx-core/src/reader/mod.rs +++ b/docx-core/src/reader/mod.rs @@ -21,6 +21,7 @@ mod font_scheme; mod footer; mod from_xml; mod header; +mod header_or_footer_rels; mod hyperlink; mod ignore; mod insert; @@ -62,7 +63,7 @@ mod wps_shape; mod wps_text_box; mod xml_element; -use std::{collections::HashMap, io::Cursor}; +use std::{collections::HashMap, io::Cursor, path::PathBuf}; use crate::documents::*; @@ -75,6 +76,8 @@ pub use read_zip::*; pub use xml_element::*; use zip::ZipArchive; +use self::header_or_footer_rels::{read_header_or_footer_rels, ReadHeaderOrFooterRels}; + // 2006 const DOC_RELATIONSHIP_TYPE: &str = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; @@ -107,16 +110,17 @@ const COMMENTS_EXTENDED_TYPE: &str = fn read_headers( rels: &ReadDocumentRels, archive: &mut ZipArchive>, -) -> HashMap { +) -> HashMap { let header_paths = rels.find_target_path(HEADER_TYPE); - let headers: HashMap = header_paths + let headers: HashMap = header_paths .unwrap_or_default() .into_iter() .filter_map(|(rid, path, ..)| { let data = read_zip(archive, path.to_str().expect("should have header path.")); if let Ok(d) = data { if let Ok(h) = Header::from_xml(&d[..]) { - return Some((rid, h)); + let rels = read_header_or_footer_rels(archive, path).unwrap_or_default(); + return Some((rid, (h, rels))); } } None @@ -128,16 +132,17 @@ fn read_headers( fn read_footers( rels: &ReadDocumentRels, archive: &mut ZipArchive>, -) -> HashMap { +) -> HashMap { let footer_paths = rels.find_target_path(FOOTER_TYPE); - let footers: HashMap = footer_paths + let footers: HashMap = footer_paths .unwrap_or_default() .into_iter() .filter_map(|(rid, path, ..)| { let data = read_zip(archive, path.to_str().expect("should have footer path.")); if let Ok(d) = data { if let Ok(h) = Footer::from_xml(&d[..]) { - return Some((rid, h)); + let rels = read_header_or_footer_rels(archive, path).unwrap_or_default(); + return Some((rid, (h, rels))); } } None @@ -291,11 +296,14 @@ pub fn read_docx(buf: &[u8]) -> Result { // assign headers if let Some(h) = docx.document.section_property.header_reference.clone() { - if let Some(header) = headers.get(&h.id) { + if let Some((header, rels)) = headers.get(&h.id) { docx.document = docx.document.header(header.clone(), &h.id); let count = docx.document_rels.header_count + 1; docx.document_rels.header_count = count; docx.content_type = docx.content_type.add_header(); + // Read media + let media = rels.find_target_path(IMAGE_TYPE); + docx = add_images(docx, media, &mut archive); } } if let Some(ref h) = docx @@ -304,29 +312,40 @@ pub fn read_docx(buf: &[u8]) -> Result { .first_header_reference .clone() { - if let Some(header) = headers.get(&h.id) { + if let Some((header, rels)) = headers.get(&h.id) { docx.document = docx.document.first_header(header.clone(), &h.id); let count = docx.document_rels.header_count + 1; docx.document_rels.header_count = count; docx.content_type = docx.content_type.add_header(); + // Read media + let media = rels.find_target_path(IMAGE_TYPE); + docx = add_images(docx, media, &mut archive); } } if let Some(ref h) = docx.document.section_property.even_header_reference.clone() { - if let Some(header) = headers.get(&h.id) { + if let Some((header, rels)) = headers.get(&h.id) { docx.document = docx.document.even_header(header.clone(), &h.id); let count = docx.document_rels.header_count + 1; docx.document_rels.header_count = count; docx.content_type = docx.content_type.add_header(); + + // Read media + let media = rels.find_target_path(IMAGE_TYPE); + docx = add_images(docx, media, &mut archive); } } // assign footers if let Some(f) = docx.document.section_property.footer_reference.clone() { - if let Some(footer) = footers.get(&f.id) { + if let Some((footer, rels)) = footers.get(&f.id) { docx.document = docx.document.footer(footer.clone(), &f.id); let count = docx.document_rels.footer_count + 1; docx.document_rels.footer_count = count; docx.content_type = docx.content_type.add_footer(); + + // Read media + let media = rels.find_target_path(IMAGE_TYPE); + docx = add_images(docx, media, &mut archive); } } @@ -336,19 +355,27 @@ pub fn read_docx(buf: &[u8]) -> Result { .first_footer_reference .clone() { - if let Some(footer) = footers.get(&f.id) { + if let Some((footer, rels)) = footers.get(&f.id) { docx.document = docx.document.first_footer(footer.clone(), &f.id); let count = docx.document_rels.footer_count + 1; docx.document_rels.footer_count = count; docx.content_type = docx.content_type.add_footer(); + + // Read media + let media = rels.find_target_path(IMAGE_TYPE); + docx = add_images(docx, media, &mut archive); } } if let Some(ref f) = docx.document.section_property.even_footer_reference.clone() { - if let Some(footer) = footers.get(&f.id) { + if let Some((footer, rels)) = footers.get(&f.id) { docx.document = docx.document.even_footer(footer.clone(), &f.id); let count = docx.document_rels.footer_count + 1; docx.document_rels.footer_count = count; docx.content_type = docx.content_type.add_footer(); + + // Read media + let media = rels.find_target_path(IMAGE_TYPE); + docx = add_images(docx, media, &mut archive); } } @@ -415,13 +442,7 @@ pub fn read_docx(buf: &[u8]) -> Result { } // Read media let media = rels.find_target_path(IMAGE_TYPE); - if let Some(paths) = media { - for (id, media, ..) in paths { - if let Ok(data) = read_zip(&mut archive, media.to_str().expect("should have media")) { - docx = docx.add_image(id, media.to_str().unwrap().to_string(), data); - } - } - } + docx = add_images(docx, media, &mut archive); // Read hyperlinks let links = rels.find_target_path(HYPERLINK_TYPE); @@ -436,3 +457,19 @@ pub fn read_docx(buf: &[u8]) -> Result { Ok(docx) } + +fn add_images( + mut docx: Docx, + media: Option)>>, + archive: &mut ZipArchive>, +) -> Docx { + // Read media + if let Some(paths) = media { + for (id, media, ..) in paths { + if let Ok(data) = read_zip(archive, media.to_str().expect("should have media")) { + docx = docx.add_image(id, media.to_str().unwrap().to_string(), data); + } + } + } + docx +} diff --git a/docx-core/src/reader/rels.rs b/docx-core/src/reader/rels.rs index 349955a..de5ac21 100644 --- a/docx-core/src/reader/rels.rs +++ b/docx-core/src/reader/rels.rs @@ -1,8 +1,14 @@ -use std::io::Read; -use xml::reader::{EventReader, XmlEvent}; - use super::*; use crate::reader::{FromXML, ReaderError}; +use std::str::FromStr; +use std::{ + collections::{BTreeMap, HashSet}, + io::Read, + path::{Path, PathBuf}, +}; +use xml::reader::{EventReader, XmlEvent}; + +pub type ReadRels = BTreeMap)>>; impl FromXML for Rels { fn from_xml(reader: R) -> Result { @@ -38,6 +44,81 @@ impl FromXML for Rels { } } +pub fn find_rels_filename(main_path: impl AsRef) -> Result { + let path = main_path.as_ref(); + let dir = path + .parent() + .ok_or(ReaderError::DocumentRelsNotFoundError)?; + let base = path + .file_stem() + .ok_or(ReaderError::DocumentRelsNotFoundError)?; + Ok(Path::new(dir) + .join("_rels") + .join(base) + .with_extension("xml.rels")) +} + +pub fn read_rels_xml(reader: R, dir: impl AsRef) -> Result { + let mut parser = EventReader::new(reader); + let mut rels: BTreeMap)>> = BTreeMap::new(); + + loop { + let e = parser.next(); + match e { + Ok(XmlEvent::StartElement { + attributes, name, .. + }) => { + let e = XMLElement::from_str(&name.local_name).unwrap(); + if let XMLElement::Relationship = e { + let mut rel_type = "".to_owned(); + let mut rid = "".to_owned(); + let mut target_mode = None; + let mut target_string = "".to_owned(); + for a in attributes { + let local_name = &a.name.local_name; + if local_name == "Type" { + rel_type = a.value.to_owned(); + } else if local_name == "Target" { + // target_str = Path::new(dir.as_ref()).join(a.value); + target_string = a.value.to_owned(); + } else if local_name == "Id" { + rid = a.value.to_owned(); + } else if local_name == "TargetMode" { + target_mode = Some(a.value.to_owned()); + } + } + + let target = if !rel_type.ends_with("hyperlink") { + Path::new(dir.as_ref()).join(target_string) + } else { + Path::new("").join(target_string) + }; + + let current = rels.remove(&rel_type); + if let Some(mut paths) = current { + paths.insert((rid, target, target_mode)); + rels.insert(rel_type, paths); + } else { + let s: HashSet<(RId, PathBuf, Option)> = + vec![(rid, target, target_mode)].into_iter().collect(); + rels.insert(rel_type, s); + } + continue; + } + } + Ok(XmlEvent::EndElement { name, .. }) => { + let e = XMLElement::from_str(&name.local_name).unwrap(); + if let XMLElement::Relationships = e { + break; + } + } + Err(_) => return Err(ReaderError::XMLReadError), + _ => {} + } + } + Ok(rels) +} + #[cfg(test)] mod tests {