1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Copyright (c) 2010-2024 openpyxl
 
from warnings import warn
 
from openpyxl.xml.functions import fromstring
 
from openpyxl.packaging.relationship import (
    get_dependents,
    get_rels_path,
    get_rel,
)
from openpyxl.packaging.workbook import WorkbookPackage
from openpyxl.workbook import Workbook
from openpyxl.workbook.defined_name import DefinedNameList
from openpyxl.workbook.external_link.external import read_external_link
from openpyxl.pivot.cache import CacheDefinition
from openpyxl.pivot.record import RecordList
from openpyxl.worksheet.print_settings import PrintTitles, PrintArea
 
from openpyxl.utils.datetime import CALENDAR_MAC_1904
 
 
class WorkbookParser:
 
    _rels = None
 
    def __init__(self, archive, workbook_part_name, keep_links=True):
        self.archive = archive
        self.workbook_part_name = workbook_part_name
        self.defined_names = DefinedNameList()
        self.wb = Workbook()
        self.keep_links = keep_links
        self.sheets = []
 
 
    @property
    def rels(self):
        if self._rels is None:
            self._rels = get_dependents(self.archive, get_rels_path(self.workbook_part_name)).to_dict()
        return self._rels
 
 
    def parse(self):
        src = self.archive.read(self.workbook_part_name)
        node = fromstring(src)
        package = WorkbookPackage.from_tree(node)
        if package.properties.date1904:
            self.wb.epoch = CALENDAR_MAC_1904
 
        self.wb.code_name = package.properties.codeName
        self.wb.active = package.active
        self.wb.views = package.bookViews
        self.sheets = package.sheets
        self.wb.calculation = package.calcPr
        self.caches = package.pivotCaches
 
        # external links contain cached worksheets and can be very big
        if not self.keep_links:
            package.externalReferences = []
 
        for ext_ref in package.externalReferences:
            rel = self.rels.get(ext_ref.id)
            self.wb._external_links.append(
                read_external_link(self.archive, rel.Target)
            )
 
        if package.definedNames:
            self.defined_names = package.definedNames
 
        self.wb.security = package.workbookProtection
 
 
    def find_sheets(self):
        """
        Find all sheets in the workbook and return the link to the source file.
 
        Older XLSM files sometimes contain invalid sheet elements.
        Warn user when these are removed.
        """
 
        for sheet in self.sheets:
            if not sheet.id:
                msg = f"File contains an invalid specification for {0}. This will be removed".format(sheet.name)
                warn(msg)
                continue
            yield sheet, self.rels[sheet.id]
 
 
    def assign_names(self):
        """
        Bind defined names and other definitions to worksheets or the workbook
        """
 
        for idx, names in self.defined_names.by_sheet().items():
            if idx == "global":
                self.wb.defined_names = names
                continue
 
            try:
                sheet = self.wb._sheets[idx]
            except IndexError:
                warn(f"Defined names for sheet index {idx} cannot be located")
                continue
 
            for name, defn in names.items():
                reserved = defn.is_reserved
                if reserved is None:
                    sheet.defined_names[name] = defn
 
                elif reserved == "Print_Titles":
                    titles = PrintTitles.from_string(defn.value)
                    sheet._print_rows = titles.rows
                    sheet._print_cols = titles.cols
                elif reserved == "Print_Area":
                    try:
                        sheet._print_area = PrintArea.from_string(defn.value)
                    except TypeError:
                        warn(f"Print area cannot be set to Defined name: {defn.value}.")
                        continue
 
    @property
    def pivot_caches(self):
        """
        Get PivotCache objects
        """
        d = {}
        for c in self.caches:
            cache = get_rel(self.archive, self.rels, id=c.id, cls=CacheDefinition)
            if cache.deps:
                records = get_rel(self.archive, cache.deps, cache.id, RecordList)
                cache.records = records
            d[c.cacheId] = cache
        return d