3.12. Read XML

  • File paths also work with URLs

  • io.StringIO converts a str to a file-like object

  • pd.read_xml()

3.12.1. SetUp

>>> import pandas as pd
>>> from io import StringIO
>>>
>>> pd.set_option('display.width', 250)
>>> pd.set_option('display.max_columns', 20)
>>> pd.set_option('display.max_rows', 30)

3.12.2. Read XML

>>> DATA = """<?xml version="1.0"?>
... <catalog>
...    <book id="bk101">
...       <author>Gambardella, Matthew</author>
...       <title>XML Developer's Guide</title>
...       <genre>Computer</genre>
...       <price>44.95</price>
...       <publish_date>2000-10-01</publish_date>
...       <description>An in-depth look at creating applications
...       with XML.</description>
...    </book>
...    <book id="bk102">
...       <author>Ralls, Kim</author>
...       <title>Midnight Rain</title>
...       <genre>Fantasy</genre>
...       <price>5.95</price>
...       <publish_date>2000-12-16</publish_date>
...       <description>A former architect battles corporate zombies,
...       an evil sorceress, and her own childhood to become queen
...       of the world.</description>
...    </book>
...    <book id="bk103">
...       <author>Corets, Eva</author>
...       <title>Maeve Ascendant</title>
...       <genre>Fantasy</genre>
...       <price>5.95</price>
...       <publish_date>2000-11-17</publish_date>
...       <description>After the collapse of a nanotechnology
...       society in England, the young survivors lay the
...       foundation for a new society.</description>
...    </book>
... </catalog>
... """
>>> data = StringIO(DATA)
>>>
>>> pd.read_xml(data)
      id                author                  title     genre  price publish_date                                        description
0  bk101  Gambardella, Matthew  XML Developer's Guide  Computer  44.95   2000-10-01  An in-depth look at creating applications\n   ...
1  bk102            Ralls, Kim          Midnight Rain   Fantasy   5.95   2000-12-16  A former architect battles corporate zombies,\...
2  bk103           Corets, Eva        Maeve Ascendant   Fantasy   5.95   2000-11-17  After the collapse of a nanotechnology\n      ...

3.12.3. XML and XSLT

>>> from lxml.etree import XML, XSLT, parse
>>> from io import StringIO
>>>
>>>
>>> TEMPLATE = """
...     <html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
...         <table>
...             <thead>
...                 <tr>
...                     <th>Id</th>
...                     <th>Author</th>
...                     <th>Title</th>
...                     <th>Genre</th>
...                     <th>Price</th>
...                     <th>Publish Date</th>
...                     <th>Description</th>
...                 </tr>
...             </thead>
...             <tbody>
...
...                 <xsl:for-each select="catalog/book">
...                     <tr>
...                         <td><xsl:value-of select="@id"/></td>
...                         <td><xsl:value-of select="author"/></td>
...                         <td><xsl:value-of select="title"/></td>
...                         <td><xsl:value-of select="genre"/></td>
...                         <td><xsl:value-of select="price"/></td>
...                         <td><xsl:value-of select="publish_date"/></td>
...                         <td><xsl:value-of select="description"/></td>
...                     </tr>
...                 </xsl:for-each>
...
...             </tbody>
...         </table>
...     </html>
... """
>>>
>>> transform = XSLT(XML(TEMPLATE))
>>> data = parse(StringIO(DATA))
>>> html = StringIO(str(transform(data)))
>>> dfs = pd.read_html(html)
>>> result = dfs[0]
>>>
>>> result
      Id                Author                  Title     Genre  Price Publish Date                                        Description
0  bk101  Gambardella, Matthew  XML Developer's Guide  Computer  44.95   2000-10-01  An in-depth look at creating applications  wit...
1  bk102            Ralls, Kim          Midnight Rain   Fantasy   5.95   2000-12-16  A former architect battles corporate zombies, ...
2  bk103           Corets, Eva        Maeve Ascendant   Fantasy   5.95   2000-11-17  After the collapse of a nanotechnology  societ...
>>>
>>> type(result) is pd.DataFrame
True
>>>
>>> len(result) > 0
True
>>>
>>> result.columns
Index(['Id', 'Author', 'Title', 'Genre', 'Price', 'Publish Date', 'Description'], dtype='str')
>>>
>>> result['Title']
0    XML Developer's Guide
1            Midnight Rain
2          Maeve Ascendant
Name: Title, dtype: str

3.12.4. Assignments

# FIXME: Split into three assignments: one with read_xml(file), one with read_xml(StringIO), and another one with mean

# %% About
# - Name: Pandas Read XML
# - Difficulty: medium
# - Lines: 1
# - Minutes: 3

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data from `DATA` as `pd.DataFrame`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` jako `pd.DataFrame`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#   firstname    lastname  age                email   lastlogin  is_active              groups
# 0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
# 1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
# 2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
# 3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
# 4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
# 5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN

# %% Hints
# - `pip install --upgrade lxml`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname    lastname  age                email   lastlogin  is_active              groups
0     Alice     Apricot   30    alice@example.com  2000-01-01       True         users;staff
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True         users;staff
2     Carol        Corn   32    carol@example.com  2000-01-03       True               users
3      Dave      Durian   33     dave@example.org  2000-01-04       True               users
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True  users;staff;admins
5   Mallory       Melon   15  mallory@example.net         NaN      False                 NaN
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
from io import StringIO
import pandas as pd

# %% Types
# Annotation only: declares that `result` is expected to be a `pd.DataFrame`.
# No value is bound by this line.
result: pd.DataFrame

# %% Data
# XML document with a single `<users>` root holding six `<user>` records.
# Every record has the same child elements: firstname, lastname, age, email,
# lastlogin, is_active and groups. The last record (Mallory) has empty
# `<lastlogin>` and `<groups>` elements.
DATA = """<?xml version="1.0" encoding="UTF-8"?>
<users>
  <user>
    <firstname>Alice</firstname>
    <lastname>Apricot</lastname>
    <age>30</age>
    <email>alice@example.com</email>
    <lastlogin>2000-01-01</lastlogin>
    <is_active>True</is_active>
    <groups>users;staff</groups>
  </user>
  <user>
    <firstname>Bob</firstname>
    <lastname>Blackthorn</lastname>
    <age>31</age>
    <email>bob@example.com</email>
    <lastlogin>2000-01-02</lastlogin>
    <is_active>True</is_active>
    <groups>users;staff</groups>
  </user>
  <user>
    <firstname>Carol</firstname>
    <lastname>Corn</lastname>
    <age>32</age>
    <email>carol@example.com</email>
    <lastlogin>2000-01-03</lastlogin>
    <is_active>True</is_active>
    <groups>users</groups>
  </user>
  <user>
    <firstname>Dave</firstname>
    <lastname>Durian</lastname>
    <age>33</age>
    <email>dave@example.org</email>
    <lastlogin>2000-01-04</lastlogin>
    <is_active>True</is_active>
    <groups>users</groups>
  </user>
  <user>
    <firstname>Eve</firstname>
    <lastname>Elderberry</lastname>
    <age>34</age>
    <email>eve@example.org</email>
    <lastlogin>2000-01-05</lastlogin>
    <is_active>True</is_active>
    <groups>users;staff;admins</groups>
  </user>
  <user>
    <firstname>Mallory</firstname>
    <lastname>Melon</lastname>
    <age>15</age>
    <email>mallory@example.net</email>
    <lastlogin></lastlogin>
    <is_active>False</is_active>
    <groups></groups>
  </user>
</users>
"""

# %% Result
# Exercise placeholder: replace the Ellipsis with an expression that reads
# `DATA` into a `pd.DataFrame`. The module doctests reject `...` and any
# value whose type is not `pd.DataFrame`.
result = ...