3.14. Read Recap

3.14.1. SetUp

>>> import pandas as pd
>>>
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

3.14.2. Assignments

# %% About
# - Name: Pandas Read CSV Replace
# - Difficulty: easy
# - Lines: 5
# - Minutes: 8

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data from `DATA` to `result: pd.DataFrame`
# 2. Use provided column names in `COLUMNS`
# 3. Read labels from the first row
# 4. Replace data in `label` column with values extracted above
# 5. Define `result: pd.DataFrame` with 25 first rows
# 6. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` do `result: pd.DataFrame`
# 2. Użyj podanych w `COLUMNS` nazw kolumn
# 3. Wczytaj nazwy etykiet z pierwszego wiersza
# 4. Podmień dane w kolumnie `label` na wartości wyciągnięte powyżej
# 5. Zdefiniuj `result: pd.DataFrame` z 25 pierwszymi wierszami
# 6. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `dict()`
# - `enumerate()`
# - `pd.read_csv(nrows, names, skiprows)`
# - `DataFrame.replace(to_replace={'column': ...})`
# - `DataFrame.head(n)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> assert len(result) == 25, \
'Variable `result` has an invalid length; expected: `25`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result.loc[[0,1,2,3,4,5], ['mean radius', 'mean texture', 'label']]
   mean radius  mean texture      label
0        17.99         10.38  malignant
1        20.57         17.77  malignant
2        19.69         21.25  malignant
3        11.42         20.38  malignant
4        20.29         14.34  malignant
5        12.45         15.70  malignant

>>> result['label'].value_counts()
label
malignant    22
benign        3
Name: count, dtype: int64
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
result: pd.DataFrame  # final frame checked by the doctests (must hold exactly 25 rows)

# %% Data
# URL of the CSV file to read; per the task description its first row holds
# the label names that later replace the values in the `label` column.
DATA = 'https://python3.info/_static/breast-cancer.csv'

# Column names to apply when reading `DATA`: 31 names, the last one is `label`.
COLUMNS = [
    'mean radius', 'mean texture', 'mean perimeter', 'mean area',
    'mean smoothness', 'mean compactness', 'mean concavity',
    'mean concave points', 'mean symmetry', 'mean fractal dimension',
    'radius error', 'texture error', 'perimeter error', 'area error',
    'smoothness error', 'compactness error', 'concavity error',
    'concave points error', 'symmetry error',
    'fractal dimension error', 'worst radius', 'worst texture',
    'worst perimeter', 'worst area', 'worst smoothness',
    'worst compactness', 'worst concavity', 'worst concave points',
    'worst symmetry', 'worst fractal dimension', 'label',
]

# %% Result
# Placeholder: the doctests reject `Ellipsis`, so replace it with the solution.
result = ...

# %% About
# - Name: Pandas Read HTML
# - Difficulty: easy
# - Lines: 2
# - Minutes: 5

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data from `DATA` as `data: pd.DataFrame`
# 2. Define `result` with active European Space Agency astronauts
# 3. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` jako `data: pd.DataFrame`
# 2. Zdefiniuj `result` z aktywnymi astronautami Europejskiej Agencji Kosmicznej
# 3. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `pip install --upgrade lxml`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result['Name']
0      Samantha Cristoforetti
1             Alexander Gerst
2            Andreas Mogensen
3              Luca Parmitano
4              Thomas Pesquet
5             Matthias Maurer
6             Rosemary Coogan
7               Sophie Adenot
8     Pablo Álvarez Fernández
9            Raphaël Liégeois
10               Marco Sieber
Name: Name, dtype: str
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
result: pd.DataFrame  # frame whose `Name` column is compared by the doctests

# %% Data
# URL of the HTML page to read; the task asks for the active
# European Space Agency astronauts found in it.
DATA = 'https://python3.info/_static/european-astronaut-corps.html'

# %% Result
# Placeholder: the doctests reject `Ellipsis`, so replace it with the solution.
result = ...

# %% About
# - Name: Pandas Read JSON
# - Difficulty: easy
# - Lines: 1
# - Minutes: 3

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data from `DATA` as `result: pd.DataFrame`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` jako `result: pd.DataFrame`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result.loc[[0,10,20]]
    sepal_length  sepal_width  petal_length  petal_width     species
0            5.1          3.5           1.4          0.2      setosa
10           7.0          3.2           4.7          1.4  versicolor
20           6.3          3.3           6.0          2.5   virginica
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
result: pd.DataFrame  # frame whose rows 0, 10 and 20 are compared by the doctests

# %% Data
# URL of the JSON file to read directly into `result`.
DATA = 'https://python3.info/_static/iris.json'

# %% Result
# Placeholder: the doctests reject `Ellipsis`, so replace it with the solution.
result = ...

# %% About
# - Name: Pandas Read JSON OpenAPI
# - Difficulty: medium
# - Lines: 3
# - Minutes: 5

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Import `requests` module
# 2. Define `resp` with result of `requests.get()` for `DATA`
# 3. Define `data` with conversion of `resp` from JSON to Python dict by calling `.json()` on `resp`
# 4. Define `result: pd.DataFrame` from value for key `paths` in `data` dict
# 5. Run doctests - all must succeed

# %% Polish
# 1. Zaimportuj moduł `requests`
# 2. Zdefiniuj `resp` z rezultatem `requests.get()` dla `DATA`
# 3. Zdefiniuj `data` z przekształceniem `resp` z JSON do Python dict wywołując `.json()` na `resp`
# 4. Zdefiniuj `result: pd.DataFrame` dla wartości z klucza `paths` w słowniku `data`
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `pd.DataFrame(data)`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> list(result.index)
['put', 'post', 'get', 'delete']

>>> list(result.columns)  # doctest: +NORMALIZE_WHITESPACE
['/pet', '/pet/findByStatus', '/pet/findByTags', '/pet/{petId}', '/pet/{petId}/uploadImage',
 '/store/inventory', '/store/order', '/store/order/{orderId}',
 '/user', '/user/createWithList', '/user/login', '/user/logout', '/user/{username}']
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd
import requests

# %% Types
resp: requests.models.Response  # HTTP response of `requests.get()` for `DATA`
data: dict  # `resp` converted from JSON by calling `.json()` on it
result: pd.DataFrame  # built from the value under the `paths` key of `data`

# %% Data
# URL of the OpenAPI JSON document to download.
DATA = 'https://python3.info/_static/openapi.json'

# %% Result
# Placeholders for the three steps of the task; the doctests only inspect
# `result` and reject `Ellipsis` there.
resp = ...
data = ...
result = ...

# FIXME: Split into three assignments: one with read_xml(file), one with read_xml(StringIO), and another with mean

# %% About
# - Name: Pandas Read XML
# - Difficulty: medium
# - Lines: 1
# - Minutes: 3

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data from `DATA` as `result: pd.DataFrame`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` jako `result: pd.DataFrame`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#   firstname    lastname  age                email   lastlogin  is_active
# 0     Alice     Apricot   30    alice@example.com  2000-01-01       True
# 1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True
# 2     Carol        Corn   32    carol@example.com  2000-01-03       True
# 3      Dave      Durian   33     dave@example.org  2000-01-04       True
# 4       Eve  Elderberry   34      eve@example.org  2000-01-05       True
# 5   Mallory       Melon   15  mallory@example.net         NaN      False

# %% Hints
# - `pip install --upgrade lxml`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
  firstname    lastname  age                email   lastlogin  is_active
0     Alice     Apricot   30    alice@example.com  2000-01-01       True
1       Bob  Blackthorn   31      bob@example.com  2000-01-02       True
2     Carol        Corn   32    carol@example.com  2000-01-03       True
3      Dave      Durian   33     dave@example.org  2000-01-04       True
4       Eve  Elderberry   34      eve@example.org  2000-01-05       True
5   Mallory       Melon   15  mallory@example.net         NaN      False
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
from io import StringIO
import pandas as pd

# %% Types
result: pd.DataFrame  # frame compared in full by the doctests

# %% Data
# Inline XML document: six `<user>` records under the `<users>` root.
# The last record (Mallory) has an empty `<lastlogin>` element, which the
# expected output shows as `NaN`.
# NOTE(review): `StringIO` is imported above, presumably so this string can be
# handed to the reader as a file-like object - confirm against the solution.
DATA = """<?xml version="1.0" encoding="UTF-8"?>
<users>
  <user>
    <firstname>Alice</firstname>
    <lastname>Apricot</lastname>
    <age>30</age>
    <email>alice@example.com</email>
    <lastlogin>2000-01-01</lastlogin>
    <is_active>True</is_active>
  </user>
  <user>
    <firstname>Bob</firstname>
    <lastname>Blackthorn</lastname>
    <age>31</age>
    <email>bob@example.com</email>
    <lastlogin>2000-01-02</lastlogin>
    <is_active>True</is_active>
  </user>
  <user>
    <firstname>Carol</firstname>
    <lastname>Corn</lastname>
    <age>32</age>
    <email>carol@example.com</email>
    <lastlogin>2000-01-03</lastlogin>
    <is_active>True</is_active>
  </user>
  <user>
    <firstname>Dave</firstname>
    <lastname>Durian</lastname>
    <age>33</age>
    <email>dave@example.org</email>
    <lastlogin>2000-01-04</lastlogin>
    <is_active>True</is_active>
  </user>
  <user>
    <firstname>Eve</firstname>
    <lastname>Elderberry</lastname>
    <age>34</age>
    <email>eve@example.org</email>
    <lastlogin>2000-01-05</lastlogin>
    <is_active>True</is_active>
  </user>
  <user>
    <firstname>Mallory</firstname>
    <lastname>Melon</lastname>
    <age>15</age>
    <email>mallory@example.net</email>
    <lastlogin></lastlogin>
    <is_active>False</is_active>
  </user>
</users>
"""

# %% Result
# Placeholder: the doctests reject `Ellipsis`, so replace it with the solution.
result = ...