e53e7662e9
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
75 lines
1.5 KiB
Go
75 lines
1.5 KiB
Go
package parse
|
|
|
|
import (
|
|
"archive/zip"
|
|
"bytes"
|
|
"encoding/xml"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
)
|
|
|
|
// ExtractDOCX reads a .docx file and returns its plain text.
|
|
// DOCX is a ZIP archive; we unzip word/document.xml, walk <w:t> nodes
|
|
// for text, and emit a newline at each <w:p> boundary.
|
|
func ExtractDOCX(r io.Reader) (string, error) {
|
|
data, err := io.ReadAll(r)
|
|
if err != nil {
|
|
return "", fmt.Errorf("read docx: %w", err)
|
|
}
|
|
zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
|
|
if err != nil {
|
|
return "", fmt.Errorf("open docx zip: %w", err)
|
|
}
|
|
var docFile *zip.File
|
|
for _, f := range zr.File {
|
|
if f.Name == "word/document.xml" {
|
|
docFile = f
|
|
break
|
|
}
|
|
}
|
|
if docFile == nil {
|
|
return "", errors.New("word/document.xml not found in docx")
|
|
}
|
|
rc, err := docFile.Open()
|
|
if err != nil {
|
|
return "", fmt.Errorf("open document.xml: %w", err)
|
|
}
|
|
defer rc.Close()
|
|
return parseDocXML(rc)
|
|
}
|
|
|
|
// parseDocXML streams the XML of a word/document.xml part from r and returns
// its plain text with leading and trailing whitespace trimmed.
//
// Elements are matched by local name only (namespace is ignored):
//   - character data inside <t> elements is copied to the output;
//   - the end of each <p> (paragraph) emits a newline;
//   - <tab/> directly inside a run (<r>) emits a tab character;
//   - <br/> and <cr/> directly inside a run emit a newline.
//
// Without the last two rules, text on either side of a tab or line break
// would be concatenated into a single word.
//
// A non-nil error is returned, together with an empty string, when the XML
// cannot be tokenized.
func parseDocXML(r io.Reader) (string, error) {
	dec := xml.NewDecoder(r)

	var (
		sb     strings.Builder
		inText bool
		// parents holds the local names of the currently open elements so
		// that a <tab/> inside a run can be told apart from a tab-stop
		// definition (<tabs><tab .../></tabs>) in paragraph properties.
		parents []string
	)

	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("parse document.xml: %w", err)
		}

		switch t := tok.(type) {
		case xml.StartElement:
			inRun := len(parents) > 0 && parents[len(parents)-1] == "r"
			switch t.Name.Local {
			case "t":
				inText = true
			case "tab":
				if inRun {
					sb.WriteByte('\t')
				}
			case "br", "cr":
				if inRun {
					sb.WriteByte('\n')
				}
			}
			// Self-closing elements also produce a matching EndElement, so
			// every push here is balanced by a pop below.
			parents = append(parents, t.Name.Local)

		case xml.EndElement:
			if n := len(parents); n > 0 {
				parents = parents[:n-1]
			}
			switch t.Name.Local {
			case "t":
				inText = false
			case "p":
				sb.WriteByte('\n')
			}

		case xml.CharData:
			if inText {
				sb.Write(t)
			}
		}
	}

	return strings.TrimSpace(sb.String()), nil
}
|