3.2. Read CSV
File paths also work with URLs
3.2.1. SetUp
>>> import pandas as pd
>>>
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
3.2.2. Example
>>> DATA = 'https://python3.info/_static/martian-en.csv'
>>>
>>> pd.read_csv(DATA)
firstname lastname birthdate gender ssn email phone
0 Mark Watney October 12 1994 male 94101212345 mwatney@nasa.gov +1 (234) 555-0000
1 Melissa Lewis July 15 1995 female 95071512345 mlewis@nasa.gov +1 (234) 555-0001
2 Rick Martinez January 21 1996 male 96012112345 rmartinez@nasa.gov +1 (234) 555-0010
3 Alex Vogel November 15 1994 male 94111512345 avogel@esa.int +49 (234) 555-0011
4 Beth Johanssen May 9 2006 female 6250912345 bjohanssen@nasa.gov +1 (234) 555-0100
5 Chris Beck August 2 1999 male 99080212345 cbeck@nasa.gov +1 (234) 555-0101
3.2.3. Parse Dates
>>> DATA = 'https://python3.info/_static/martian-en.csv'
>>>
>>> pd.read_csv(DATA, parse_dates=['birthdate'])
firstname lastname birthdate gender ssn email phone
0 Mark Watney 1994-10-12 male 94101212345 mwatney@nasa.gov +1 (234) 555-0000
1 Melissa Lewis 1995-07-15 female 95071512345 mlewis@nasa.gov +1 (234) 555-0001
2 Rick Martinez 1996-01-21 male 96012112345 rmartinez@nasa.gov +1 (234) 555-0010
3 Alex Vogel 1994-11-15 male 94111512345 avogel@esa.int +49 (234) 555-0011
4 Beth Johanssen 2006-05-09 female 6250912345 bjohanssen@nasa.gov +1 (234) 555-0100
5 Chris Beck 1999-08-02 male 99080212345 cbeck@nasa.gov +1 (234) 555-0101
3.2.4. Parameters
- `delimiter` - field separator
- `header` - row number(s) containing column labels and marking the start of the data
- `names` - how to name columns
- `index_col` - which column should be an index
- `usecols` - which columns to use
- `skiprows` - how many rows to skip, from the top
- `skipfooter` - how many rows to skip, from the bottom
- `nrows` - how many rows to read
- `skip_blank_lines` - skip blank lines?
- `parse_dates` - parse dates (convert to dates) values in those columns
- `chunksize` - how many rows to read at once (useful for working with data greater than available RAM)
- `thousands` - thousand separator (comma, period, space or `None`)
- `decimal` - decimal separator (comma or period)
- `encoding` - file encoding, default: `utf-8`
>>> def read_csv(filepath_or_buffer, *, sep=..., delimiter=None,
... header='infer', names=..., index_col=None,
... usecols=None, dtype=None, engine=None, converters=None,
... true_values=None, false_values=None, skipinitialspace=False,
... skiprows=None, skipfooter=0, nrows=None, na_values=None,
... keep_default_na=True, na_filter=True, verbose=...,
... skip_blank_lines=True, parse_dates=None,
... infer_datetime_format=..., keep_date_col=...,
... date_parser=..., date_format=None, dayfirst=False,
... cache_dates=True, iterator=False, chunksize=None,
... compression='infer', thousands=None, decimal='.',
... lineterminator=None, quotechar='"', quoting=0, doublequote=True,
... escapechar=None, comment=None, encoding=None,
... encoding_errors='strict', dialect=None, on_bad_lines='error',
... delim_whitespace=..., low_memory=True, memory_map=False,
... float_precision=None, storage_options=None,
... dtype_backend=...): ...
3.2.5. Header
>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> header = pd.read_csv(DATA, nrows=0).columns
>>>
>>> list(header)
['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
Label Encoder:
>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> header = pd.read_csv(DATA, nrows=0)
>>> nrows, nvalues, *labels = header.columns
>>> decoder = dict(enumerate(labels))
>>>
>>> decoder
{0: 'petal_length', 1: 'petal_width', 2: 'species'}
3.2.6. Content
>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> df = pd.read_csv(DATA)
>>> df.head(3)
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 setosa
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
3.2.7. Rename Columns
>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS = ['sepal_length', 'sepal_width',
... 'petal_length', 'petal_width', 'species']
>>>
>>> SPECIES = {
... 0: 'setosa',
... 1: 'versicolor',
... 2: 'virginica',
... }
>>>
>>> df = pd.read_csv(DATA)
>>> df.head(n=3)
150 4 setosa versicolor virginica
0 5.4 3.9 1.3 0.4 0
1 5.9 3.0 5.1 1.8 2
2 6.0 3.4 4.5 1.6 1
>>>
>>> df = pd.read_csv(DATA, skiprows=1, names=COLUMNS)
>>> df.head(n=3)
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 0
1 5.9 3.0 5.1 1.8 2
2 6.0 3.4 4.5 1.6 1
>>>
>>> df = df.replace({'species': SPECIES})
>>> df.head(n=3)
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 setosa
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
3.2.8. Compressed
If the file extension is `.gz`, `.bz2`, `.zip`, or `.xz`, the corresponding compression method is selected automatically (`compression='infer'` is the default)
>>> df = pd.read_csv('sample_file.zip', compression='zip')
>>> df = pd.read_csv('sample_file.gz', compression='infer')
3.2.9. Use Case - 1
>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS = ['sepal_length', 'sepal_width',
... 'petal_length', 'petal_width', 'species']
>>> header = pd.read_csv(DATA, nrows=0)
>>> nrows, ncols, *class_labels = header.columns
>>> label_encoder = dict(enumerate(class_labels))
>>>
>>> label_encoder
{0: 'setosa', 1: 'versicolor', 2: 'virginica'}
>>> df = (
... pd
... .read_csv(DATA, names=COLUMNS, skiprows=1)
... .replace({'species':label_encoder})
... .head(n=5)
... )
>>> df
sepal_length sepal_width petal_length petal_width species
0 5.4 3.9 1.3 0.4 setosa
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
3 7.3 2.9 6.3 1.8 virginica
4 5.6 2.5 3.9 1.1 versicolor
3.2.10. Assignments
# %% About
# - Name: Pandas ReadCSV Data
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Define variable `result` with the solution
# 3. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Zdefiniuj zmienną `result` z rozwiązaniem
# 3. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# firstname lastname age email lastlogin is_active groups
# 0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
# 1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
# 2 Carol Corn 32 carol@example.com 2000-01-03 True users
# 3 Dave Durian 33 dave@example.org 2000-01-04 True users
# 4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
# 5 Mallory Melon 15 mallory@example.net NaN False NaN
# %% Hints
# - `pd.read_csv()`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname age email lastlogin is_active groups
0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
2 Carol Corn 32 carol@example.com 2000-01-03 True users
3 Dave Durian 33 dave@example.org 2000-01-04 True users
4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
5 Mallory Melon 15 mallory@example.net NaN False NaN
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
result: pd.DataFrame
# %% Data
DATA = 'https://python3.info/_static/example.csv'
# %% Result
result = ...
# %% About
# - Name: Pandas ReadCSV Parse Date
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Parse dates in "lastlogin" column
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Sparsuj daty w kolumnie "lastlogin"
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# firstname lastname age email lastlogin is_active groups
# 0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
# 1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
# 2 Carol Corn 32 carol@example.com 2000-01-03 True users
# 3 Dave Durian 33 dave@example.org 2000-01-04 True users
# 4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
# 5 Mallory Melon 15 mallory@example.net NaT False NaN
# %% Hints
# - `pd.read_csv(parse_dates=...)`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result.dtypes # doctest: +NORMALIZE_WHITESPACE
firstname str
lastname str
age int64
email str
lastlogin datetime64[us]
is_active bool
groups str
dtype: object
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname age email lastlogin is_active groups
0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
2 Carol Corn 32 carol@example.com 2000-01-03 True users
3 Dave Durian 33 dave@example.org 2000-01-04 True users
4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
5 Mallory Melon 15 mallory@example.net NaT False NaN
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
result: pd.DataFrame
# %% Data
DATA = 'https://python3.info/_static/example.csv'
# %% Result
result = ...
# %% About
# - Name: Pandas ReadCSV Encoding
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Read data using 'utf-8' encoding
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Sparsuj dane używając kodowania 'utf-8'
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# firstname lastname age email lastlogin is_active groups
# 0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
# 1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
# 2 Carol Corn 32 carol@example.com 2000-01-03 True users
# 3 Dave Durian 33 dave@example.org 2000-01-04 True users
# 4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
# 5 Mallory Melon 15 mallory@example.net NaN False NaN
# %% Hints
# - `pd.read_csv(encoding=...)`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname age email lastlogin is_active groups
0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
2 Carol Corn 32 carol@example.com 2000-01-03 True users
3 Dave Durian 33 dave@example.org 2000-01-04 True users
4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
5 Mallory Melon 15 mallory@example.net NaN False NaN
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
result: pd.DataFrame
# %% Data
DATA = 'https://python3.info/_static/example.csv'
# %% Result
result = ...
# %% About
# - Name: Pandas ReadCSV Skip Rows
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Use `NAMES` as column names and skip first row
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Użyj `NAMES` jako nazw kolumn i pomiń pierwszy wiersz
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# fname lname age email lastlogin is_active groups
# 0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
# 1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
# 2 Carol Corn 32 carol@example.com 2000-01-03 True users
# 3 Dave Durian 33 dave@example.org 2000-01-04 True users
# 4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
# 5 Mallory Melon 15 mallory@example.net NaN False NaN
# %% Hints
# - `pd.read_csv(names=..., skiprows=...)`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
fname lname age email lastlogin is_active groups
0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
2 Carol Corn 32 carol@example.com 2000-01-03 True users
3 Dave Durian 33 dave@example.org 2000-01-04 True users
4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
5 Mallory Melon 15 mallory@example.net NaN False NaN
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
result: pd.DataFrame
# %% Data
DATA = 'https://python3.info/_static/example.csv'
NAMES = [
'fname',
'lname',
'age',
'email',
'lastlogin',
'is_active',
'groups',
]
# %% Result
result = ...
# %% About
# - Name: Pandas ReadCSV N Rows
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Read only first 3 rows
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Wczytaj tylko pierwsze 3 wiersze
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# firstname lastname age email lastlogin is_active groups
# 0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
# 1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
# 2 Carol Corn 32 carol@example.com 2000-01-03 True users
# %% Hints
# - `pd.read_csv(nrows=...)`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname age email lastlogin is_active groups
0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
2 Carol Corn 32 carol@example.com 2000-01-03 True users
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
result: pd.DataFrame
# %% Data
DATA = 'https://python3.info/_static/example.csv'
# %% Result
result = ...
# %% About
# - Name: Pandas ReadCSV Use Cols
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Read only columns: 'firstname', 'lastname', 'age'
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Wczytaj tylko kolumny: 'firstname', 'lastname', 'age'
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# firstname lastname age
# 0 Alice Apricot 30
# 1 Bob Blackthorn 31
# 2 Carol Corn 32
# 3 Dave Durian 33
# 4 Eve Elderberry 34
# 5 Mallory Melon 15
# %% Hints
# - `pd.read_csv(usecols=...)`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname age
0 Alice Apricot 30
1 Bob Blackthorn 31
2 Carol Corn 32
3 Dave Durian 33
4 Eve Elderberry 34
5 Mallory Melon 15
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
result: pd.DataFrame
# %% Data
DATA = 'https://python3.info/_static/example.csv'
# %% Result
result = ...
# %% About
# - Name: Pandas ReadCSV Index Col
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Set `email` column as index
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Ustaw kolumnę `email` jako indeks
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# firstname lastname age lastlogin is_active groups
# email
# alice@example.com Alice Apricot 30 2000-01-01 True users;staff
# bob@example.com Bob Blackthorn 31 2000-01-02 True users;staff
# carol@example.com Carol Corn 32 2000-01-03 True users
# dave@example.org Dave Durian 33 2000-01-04 True users
# eve@example.org Eve Elderberry 34 2000-01-05 True users;staff;admins
# mallory@example.net Mallory Melon 15 NaN False NaN
# %% Hints
# - `pd.read_csv(index_col=...)`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname age lastlogin is_active groups
email
alice@example.com Alice Apricot 30 2000-01-01 True users;staff
bob@example.com Bob Blackthorn 31 2000-01-02 True users;staff
carol@example.com Carol Corn 32 2000-01-03 True users
dave@example.org Dave Durian 33 2000-01-04 True users
eve@example.org Eve Elderberry 34 2000-01-05 True users;staff;admins
mallory@example.net Mallory Melon 15 NaN False NaN
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
result: pd.DataFrame
# %% Data
DATA = 'https://python3.info/_static/example.csv'
# %% Result
result = ...
# %% About
# - Name: Pandas ReadCSV Compression
# - Difficulty: easy
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data `DATA` in CSV format to Pandas DataFrame
# 2. Use `zip` compression when reading the data
# 3. Define variable `result` with the solution
# 4. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane `DATA` w formacie CSV do Pandas DataFrame
# 2. Ustaw kompresję `zip` podczas wczytywania danych
# 3. Zdefiniuj zmienną `result` z rozwiązaniem
# 4. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# firstname lastname age email lastlogin is_active groups
# 0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
# 1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
# 2 Carol Corn 32 carol@example.com 2000-01-03 True users
# 3 Dave Durian 33 dave@example.org 2000-01-04 True users
# 4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
# 5 Mallory Melon 15 mallory@example.net NaN False NaN
# %% Hints
# - `pd.read_csv(compression=...)`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an is invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
firstname lastname age email lastlogin is_active groups
0 Alice Apricot 30 alice@example.com 2000-01-01 True users;staff
1 Bob Blackthorn 31 bob@example.com 2000-01-02 True users;staff
2 Carol Corn 32 carol@example.com 2000-01-03 True users
3 Dave Durian 33 dave@example.org 2000-01-04 True users
4 Eve Elderberry 34 eve@example.org 2000-01-05 True users;staff;admins
5 Mallory Melon 15 mallory@example.net NaN False NaN
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
result: pd.DataFrame
# %% Data
DATA = 'https://python3.info/_static/example.zip'
# %% Result
result = ...