Files
2026-05-11 13:03:04 +03:00

75 lines
1.5 KiB
Go

package parse
import (
"archive/zip"
"bytes"
"encoding/xml"
"errors"
"fmt"
"io"
"strings"
)
// ExtractDOCX returns the plain text of the .docx document read from r.
//
// A .docx file is a ZIP archive. The whole input is buffered in memory,
// opened as a ZIP, and the first entry named word/document.xml is handed to
// parseDocXML, whose result is returned unchanged. An error is returned when
// r cannot be read, the data is not a valid ZIP archive, the archive has no
// word/document.xml entry, or that entry cannot be opened.
func ExtractDOCX(r io.Reader) (string, error) {
	// zip.NewReader needs an io.ReaderAt plus the total size, so the stream
	// is read fully before the archive is opened.
	raw, err := io.ReadAll(r)
	if err != nil {
		return "", fmt.Errorf("read docx: %w", err)
	}

	archive, err := zip.NewReader(bytes.NewReader(raw), int64(len(raw)))
	if err != nil {
		return "", fmt.Errorf("open docx zip: %w", err)
	}

	for _, entry := range archive.File {
		if entry.Name != "word/document.xml" {
			continue
		}
		// First matching entry wins; parse it and return straight away.
		body, err := entry.Open()
		if err != nil {
			return "", fmt.Errorf("open document.xml: %w", err)
		}
		defer body.Close()
		return parseDocXML(body)
	}
	return "", errors.New("word/document.xml not found in docx")
}
// parseDocXML walks the XML token stream read from r and returns the text it
// contains.
//
// Character data inside <t> elements is copied verbatim. Every closing <p>
// contributes a newline. The run-level markers <tab/>, <br/> and <cr/>
// contribute a tab or a newline respectively, so words separated only by one
// of those markers are not fused together. Elements are matched by local name
// only; the namespace is not checked.
//
// The returned string has leading and trailing whitespace trimmed. If the
// decoder reports anything other than a clean end of input, the error is
// wrapped and returned together with an empty string.
func parseDocXML(r io.Reader) (string, error) {
	dec := xml.NewDecoder(r)
	var sb strings.Builder
	inText := false
	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("parse document.xml: %w", err)
		}
		switch t := tok.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "t":
				inText = true
			case "tabs":
				// <tabs> lists tab-stop definitions whose children are also
				// named <tab>. Skip the whole subtree so those definitions
				// are not mistaken for tab characters in the text.
				if err := dec.Skip(); err != nil {
					return "", fmt.Errorf("parse document.xml: %w", err)
				}
			case "tab":
				sb.WriteByte('\t')
			case "br", "cr":
				sb.WriteByte('\n')
			}
		case xml.EndElement:
			switch t.Name.Local {
			case "t":
				inText = false
			case "p":
				// Paragraph boundary: terminate the line.
				sb.WriteByte('\n')
			}
		case xml.CharData:
			// Only text inside <t> is document content; character data
			// elsewhere (e.g. field instructions) is ignored.
			if inText {
				sb.Write(t)
			}
		}
	}
	return strings.TrimSpace(sb.String()), nil
}