Read header and footer image (#679)

* feat: read header images

* feat: read footer image

* update snaps
main
bokuweb 2024-02-10 10:59:35 +09:00 committed by GitHub
parent 13e5615518
commit 8224ff0663
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 190 additions and 106 deletions

View File

@ -1,10 +1,10 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::io::{Cursor, Read};
use std::io::Cursor;
use std::path::*;
use std::str::FromStr;
use xml::reader::{EventReader, XmlEvent};
use document_rels::rels::find_rels_filename;
use document_rels::rels::read_rels_xml;
use super::errors::*;
use super::*;
@ -36,84 +36,5 @@ pub fn read_document_rels(
let p = p.to_str().ok_or(ReaderError::DocumentRelsNotFoundError)?;
let data = read_zip(archive, p)?;
let rels = read_rels_xml(&data[..], dir)?;
Ok(rels)
}
fn read_rels_xml<R: Read>(
reader: R,
dir: impl AsRef<Path>,
) -> Result<ReadDocumentRels, ReaderError> {
let mut parser = EventReader::new(reader);
let mut rels = ReadDocumentRels {
rels: BTreeMap::new(),
};
loop {
let e = parser.next();
match e {
Ok(XmlEvent::StartElement {
attributes, name, ..
}) => {
let e = XMLElement::from_str(&name.local_name).unwrap();
if let XMLElement::Relationship = e {
let mut rel_type = "".to_owned();
let mut rid = "".to_owned();
let mut target_mode = None;
let mut target_string = "".to_owned();
for a in attributes {
let local_name = &a.name.local_name;
if local_name == "Type" {
rel_type = a.value.to_owned();
} else if local_name == "Target" {
// target_str = Path::new(dir.as_ref()).join(a.value);
target_string = a.value.to_owned();
} else if local_name == "Id" {
rid = a.value.to_owned();
} else if local_name == "TargetMode" {
target_mode = Some(a.value.to_owned());
}
}
let target = if !rel_type.ends_with("hyperlink") {
Path::new(dir.as_ref()).join(target_string)
} else {
Path::new("").join(target_string)
};
let current = rels.rels.remove(&rel_type);
if let Some(mut paths) = current {
paths.insert((rid, target, target_mode));
rels.rels.insert(rel_type, paths);
} else {
let s: HashSet<(RId, PathBuf, Option<String>)> =
vec![(rid, target, target_mode)].into_iter().collect();
rels.rels.insert(rel_type, s);
}
continue;
}
}
Ok(XmlEvent::EndElement { name, .. }) => {
let e = XMLElement::from_str(&name.local_name).unwrap();
if let XMLElement::Relationships = e {
break;
}
}
Err(_) => return Err(ReaderError::XMLReadError),
_ => {}
}
}
Ok(rels)
}
fn find_rels_filename(main_path: impl AsRef<Path>) -> Result<PathBuf, ReaderError> {
let path = main_path.as_ref();
let dir = path
.parent()
.ok_or(ReaderError::DocumentRelsNotFoundError)?;
let base = path
.file_stem()
.ok_or(ReaderError::DocumentRelsNotFoundError)?;
Ok(Path::new(dir)
.join("_rels")
.join(base)
.with_extension("xml.rels"))
Ok(ReadDocumentRels { rels })
}

View File

@ -20,6 +20,8 @@ pub enum ReaderError {
DocumentStylesNotFoundError,
#[error("Failed to find numberings.")]
DocumentNumberingsNotFoundError,
#[error("Failed to find header or footer rels.")]
HeaderOrFooterRelsNotFoundError,
#[error("Unknown error")]
Unknown,
}

View File

@ -0,0 +1,43 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::io::Cursor;
use std::path::*;
use header_or_footer_rels::rels::find_rels_filename;
use self::rels::read_rels_xml;
use super::errors::*;
use super::*;
/// Relationship id, taken from the `Id` attribute of a relationship entry.
pub type RId = String;

/// Relationships read from the `.rels` part that accompanies a header or footer.
///
/// `rels` maps a relationship type to the set of `(id, target path, target mode)` entries
/// declared with that type.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct ReadHeaderOrFooterRels {
    pub rels: BTreeMap<String, HashSet<(RId, PathBuf, Option<String>)>>,
}

impl ReadHeaderOrFooterRels {
    /// Returns a copy of every entry stored under the relationship type `target`.
    ///
    /// Returns `None` when no entry with that type exists. The entries come from a `HashSet`,
    /// so their order in the returned vector is unspecified.
    pub fn find_target_path(&self, target: &str) -> Option<Vec<(RId, PathBuf, Option<String>)>> {
        // Clone the entries straight into the vector rather than cloning the whole set first.
        self.rels
            .get(target)
            .map(|entries| entries.iter().cloned().collect())
    }
}
/// Reads the relationships part that accompanies a header or footer part.
///
/// The `.rels` location is derived from `header_or_footer_path` with `find_rels_filename`, the
/// part is loaded from `archive`, and it is parsed with the directory of the header/footer part
/// as the base directory for relationship targets.
///
/// # Errors
///
/// Returns [`ReaderError::HeaderOrFooterRelsNotFoundError`] when `header_or_footer_path` has no
/// parent directory or the derived `.rels` path is not valid UTF-8. Errors from
/// `find_rels_filename`, `read_zip` and `read_rels_xml` are propagated.
pub fn read_header_or_footer_rels(
    archive: &mut zip::read::ZipArchive<Cursor<&[u8]>>,
    header_or_footer_path: impl AsRef<Path>,
) -> Result<ReadHeaderOrFooterRels, ReaderError> {
    let part_path = header_or_footer_path.as_ref();
    let base_dir = match part_path.parent() {
        Some(parent) => parent,
        None => return Err(ReaderError::HeaderOrFooterRelsNotFoundError),
    };

    let rels_path = find_rels_filename(part_path)?;
    let rels_name = match rels_path.to_str() {
        Some(name) => name,
        None => return Err(ReaderError::HeaderOrFooterRelsNotFoundError),
    };

    let bytes = read_zip(archive, rels_name)?;
    let rels = read_rels_xml(&bytes[..], base_dir)?;
    Ok(ReadHeaderOrFooterRels { rels })
}

View File

@ -21,6 +21,7 @@ mod font_scheme;
mod footer;
mod from_xml;
mod header;
mod header_or_footer_rels;
mod hyperlink;
mod ignore;
mod insert;
@ -62,7 +63,7 @@ mod wps_shape;
mod wps_text_box;
mod xml_element;
use std::{collections::HashMap, io::Cursor};
use std::{collections::HashMap, io::Cursor, path::PathBuf};
use crate::documents::*;
@ -75,6 +76,8 @@ pub use read_zip::*;
pub use xml_element::*;
use zip::ZipArchive;
use self::header_or_footer_rels::{read_header_or_footer_rels, ReadHeaderOrFooterRels};
// 2006
const DOC_RELATIONSHIP_TYPE: &str =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
@ -107,16 +110,17 @@ const COMMENTS_EXTENDED_TYPE: &str =
fn read_headers(
rels: &ReadDocumentRels,
archive: &mut ZipArchive<Cursor<&[u8]>>,
) -> HashMap<RId, Header> {
) -> HashMap<RId, (Header, ReadHeaderOrFooterRels)> {
let header_paths = rels.find_target_path(HEADER_TYPE);
let headers: HashMap<RId, Header> = header_paths
let headers: HashMap<RId, (Header, ReadHeaderOrFooterRels)> = header_paths
.unwrap_or_default()
.into_iter()
.filter_map(|(rid, path, ..)| {
let data = read_zip(archive, path.to_str().expect("should have header path."));
if let Ok(d) = data {
if let Ok(h) = Header::from_xml(&d[..]) {
return Some((rid, h));
let rels = read_header_or_footer_rels(archive, path).unwrap_or_default();
return Some((rid, (h, rels)));
}
}
None
@ -128,16 +132,17 @@ fn read_headers(
fn read_footers(
rels: &ReadDocumentRels,
archive: &mut ZipArchive<Cursor<&[u8]>>,
) -> HashMap<RId, Footer> {
) -> HashMap<RId, (Footer, ReadHeaderOrFooterRels)> {
let footer_paths = rels.find_target_path(FOOTER_TYPE);
let footers: HashMap<RId, Footer> = footer_paths
let footers: HashMap<RId, (Footer, ReadHeaderOrFooterRels)> = footer_paths
.unwrap_or_default()
.into_iter()
.filter_map(|(rid, path, ..)| {
let data = read_zip(archive, path.to_str().expect("should have footer path."));
if let Ok(d) = data {
if let Ok(h) = Footer::from_xml(&d[..]) {
return Some((rid, h));
let rels = read_header_or_footer_rels(archive, path).unwrap_or_default();
return Some((rid, (h, rels)));
}
}
None
@ -291,11 +296,14 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
// assign headers
if let Some(h) = docx.document.section_property.header_reference.clone() {
if let Some(header) = headers.get(&h.id) {
if let Some((header, rels)) = headers.get(&h.id) {
docx.document = docx.document.header(header.clone(), &h.id);
let count = docx.document_rels.header_count + 1;
docx.document_rels.header_count = count;
docx.content_type = docx.content_type.add_header();
// Read media
let media = rels.find_target_path(IMAGE_TYPE);
docx = add_images(docx, media, &mut archive);
}
}
if let Some(ref h) = docx
@ -304,29 +312,40 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
.first_header_reference
.clone()
{
if let Some(header) = headers.get(&h.id) {
if let Some((header, rels)) = headers.get(&h.id) {
docx.document = docx.document.first_header(header.clone(), &h.id);
let count = docx.document_rels.header_count + 1;
docx.document_rels.header_count = count;
docx.content_type = docx.content_type.add_header();
// Read media
let media = rels.find_target_path(IMAGE_TYPE);
docx = add_images(docx, media, &mut archive);
}
}
if let Some(ref h) = docx.document.section_property.even_header_reference.clone() {
if let Some(header) = headers.get(&h.id) {
if let Some((header, rels)) = headers.get(&h.id) {
docx.document = docx.document.even_header(header.clone(), &h.id);
let count = docx.document_rels.header_count + 1;
docx.document_rels.header_count = count;
docx.content_type = docx.content_type.add_header();
// Read media
let media = rels.find_target_path(IMAGE_TYPE);
docx = add_images(docx, media, &mut archive);
}
}
// assign footers
if let Some(f) = docx.document.section_property.footer_reference.clone() {
if let Some(footer) = footers.get(&f.id) {
if let Some((footer, rels)) = footers.get(&f.id) {
docx.document = docx.document.footer(footer.clone(), &f.id);
let count = docx.document_rels.footer_count + 1;
docx.document_rels.footer_count = count;
docx.content_type = docx.content_type.add_footer();
// Read media
let media = rels.find_target_path(IMAGE_TYPE);
docx = add_images(docx, media, &mut archive);
}
}
@ -336,19 +355,27 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
.first_footer_reference
.clone()
{
if let Some(footer) = footers.get(&f.id) {
if let Some((footer, rels)) = footers.get(&f.id) {
docx.document = docx.document.first_footer(footer.clone(), &f.id);
let count = docx.document_rels.footer_count + 1;
docx.document_rels.footer_count = count;
docx.content_type = docx.content_type.add_footer();
// Read media
let media = rels.find_target_path(IMAGE_TYPE);
docx = add_images(docx, media, &mut archive);
}
}
if let Some(ref f) = docx.document.section_property.even_footer_reference.clone() {
if let Some(footer) = footers.get(&f.id) {
if let Some((footer, rels)) = footers.get(&f.id) {
docx.document = docx.document.even_footer(footer.clone(), &f.id);
let count = docx.document_rels.footer_count + 1;
docx.document_rels.footer_count = count;
docx.content_type = docx.content_type.add_footer();
// Read media
let media = rels.find_target_path(IMAGE_TYPE);
docx = add_images(docx, media, &mut archive);
}
}
@ -415,13 +442,7 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
}
// Read media
let media = rels.find_target_path(IMAGE_TYPE);
if let Some(paths) = media {
for (id, media, ..) in paths {
if let Ok(data) = read_zip(&mut archive, media.to_str().expect("should have media")) {
docx = docx.add_image(id, media.to_str().unwrap().to_string(), data);
}
}
}
docx = add_images(docx, media, &mut archive);
// Read hyperlinks
let links = rels.find_target_path(HYPERLINK_TYPE);
@ -436,3 +457,19 @@ pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
Ok(docx)
}
/// Loads the given media entries from `archive` and registers each one on `docx`.
///
/// `media` holds `(relationship id, path inside the archive, target mode)` tuples. Entries that
/// cannot be read from the archive are skipped; when `media` is `None`, `docx` is returned
/// unchanged.
///
/// # Panics
///
/// Panics if a media path is not valid UTF-8.
fn add_images(
    mut docx: Docx,
    media: Option<Vec<(RId, PathBuf, Option<String>)>>,
    archive: &mut ZipArchive<Cursor<&[u8]>>,
) -> Docx {
    for (id, path, ..) in media.into_iter().flatten() {
        let name = path.to_str().expect("should have media");
        if let Ok(data) = read_zip(archive, name) {
            docx = docx.add_image(id, name.to_string(), data);
        }
    }
    docx
}

View File

@ -1,8 +1,14 @@
use std::io::Read;
use xml::reader::{EventReader, XmlEvent};
use super::*;
use crate::reader::{FromXML, ReaderError};
use std::str::FromStr;
use std::{
collections::{BTreeMap, HashSet},
io::Read,
path::{Path, PathBuf},
};
use xml::reader::{EventReader, XmlEvent};
pub type ReadRels = BTreeMap<String, HashSet<(RId, PathBuf, Option<String>)>>;
impl FromXML for Rels {
fn from_xml<R: Read>(reader: R) -> Result<Self, ReaderError> {
@ -38,6 +44,81 @@ impl FromXML for Rels {
}
}
/// Returns the path of the relationships part that belongs to `main_path`.
///
/// The relationships part lives in a `_rels` directory next to the source part and is named
/// after the source part's full file name with `.rels` appended, e.g. `word/header1.xml` maps
/// to `word/_rels/header1.xml.rels`. Appending to the full file name (rather than rebuilding it
/// from the stem) keeps names whose stem contains a dot intact.
///
/// # Errors
///
/// Returns [`ReaderError::DocumentRelsNotFoundError`] when `main_path` has no parent or no file
/// name component.
pub fn find_rels_filename(main_path: impl AsRef<Path>) -> Result<PathBuf, ReaderError> {
    let path = main_path.as_ref();
    let dir = path
        .parent()
        .ok_or(ReaderError::DocumentRelsNotFoundError)?;
    let file_name = path
        .file_name()
        .ok_or(ReaderError::DocumentRelsNotFoundError)?;
    let mut rels_name = file_name.to_os_string();
    rels_name.push(".rels");
    Ok(dir.join("_rels").join(rels_name))
}
pub fn read_rels_xml<R: Read>(reader: R, dir: impl AsRef<Path>) -> Result<ReadRels, ReaderError> {
let mut parser = EventReader::new(reader);
let mut rels: BTreeMap<String, HashSet<(RId, PathBuf, Option<String>)>> = BTreeMap::new();
loop {
let e = parser.next();
match e {
Ok(XmlEvent::StartElement {
attributes, name, ..
}) => {
let e = XMLElement::from_str(&name.local_name).unwrap();
if let XMLElement::Relationship = e {
let mut rel_type = "".to_owned();
let mut rid = "".to_owned();
let mut target_mode = None;
let mut target_string = "".to_owned();
for a in attributes {
let local_name = &a.name.local_name;
if local_name == "Type" {
rel_type = a.value.to_owned();
} else if local_name == "Target" {
// target_str = Path::new(dir.as_ref()).join(a.value);
target_string = a.value.to_owned();
} else if local_name == "Id" {
rid = a.value.to_owned();
} else if local_name == "TargetMode" {
target_mode = Some(a.value.to_owned());
}
}
let target = if !rel_type.ends_with("hyperlink") {
Path::new(dir.as_ref()).join(target_string)
} else {
Path::new("").join(target_string)
};
let current = rels.remove(&rel_type);
if let Some(mut paths) = current {
paths.insert((rid, target, target_mode));
rels.insert(rel_type, paths);
} else {
let s: HashSet<(RId, PathBuf, Option<String>)> =
vec![(rid, target, target_mode)].into_iter().collect();
rels.insert(rel_type, s);
}
continue;
}
}
Ok(XmlEvent::EndElement { name, .. }) => {
let e = XMLElement::from_str(&name.local_name).unwrap();
if let XMLElement::Relationships = e {
break;
}
}
Err(_) => return Err(ReaderError::XMLReadError),
_ => {}
}
}
Ok(rels)
}
#[cfg(test)]
mod tests {