1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
# Show the first five rows of the DataFrame.
df.head(5)
# The string literal below is the output recorded from an interactive
# session; the middle columns are elided by the display ("...").
'''
in_tissue array_row ... n_counts clusters
AAACAATCTACTAGCA-1 1 3 ... 12054.0 12
AAACACCAATAACTGC-1 1 59 ... 18697.0 4
AAACAGAGCGACTCCT-1 1 14 ... 9192.0 0
AAACAGCTTTCAGAAG-1 1 43 ... 18037.0 7
AAACAGGGTCTATATT-1 1 47 ... 22535.0 10
'''
# Print a summary of the DataFrame: its index range, each column's
# non-null count and dtype, and the approximate memory usage.
df.info()
# Recorded output of the call above, kept as a bare string literal.
'''
<class 'pandas.core.frame.DataFrame'>
Index: 2939 entries, AAACAATCTACTAGCA-1 to TTGTTTGTGTAAATTC-1
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 in_tissue 2939 non-null int64
1 array_row 2939 non-null int64
2 array_col 2939 non-null int64
3 n_genes_by_counts 2939 non-null int32
4 log1p_n_genes_by_counts 2939 non-null float64
5 total_counts 2939 non-null float32
6 log1p_total_counts 2939 non-null float32
7 pct_counts_in_top_50_genes 2939 non-null float64
8 pct_counts_in_top_100_genes 2939 non-null float64
9 pct_counts_in_top_200_genes 2939 non-null float64
10 pct_counts_in_top_500_genes 2939 non-null float64
11 total_counts_mt 2939 non-null float32
12 log1p_total_counts_mt 2939 non-null float32
13 pct_counts_mt 2939 non-null float32
14 n_counts 2939 non-null float32
15 clusters 2939 non-null category
dtypes: category(1), float32(6), float64(5), int32(1), int64(3)
memory usage: 355.1+ KB
'''
# Compute descriptive statistics (count, mean, std, min, quartiles, max)
# for the DataFrame's columns.
# The recorded output below has 15 columns while the frame has 16;
# presumably the categorical `clusters` column is the one left out, since
# describe() summarizes only numeric columns by default -- confirm against
# the pandas documentation.
# NOTE(review): pct_counts_mt is 0.0 for every statistic in the recorded
# output -- confirm this is expected for this dataset.
df.describe()
'''
in_tissue array_row ... pct_counts_mt n_counts
count 2939.0 2939.000000 ... 2939.0 2939.000000
mean 1.0 29.536577 ... 0.0 17354.753906
std 0.0 16.150319 ... 0.0 6646.829590
min 1.0 0.000000 ... 0.0 5027.000000
25% 1.0 17.000000 ... 0.0 12212.000000
50% 1.0 30.000000 ... 0.0 16512.000000
75% 1.0 42.000000 ... 0.0 21702.000000
max 1.0 66.000000 ... 0.0 34988.000000
[8 rows x 15 columns]
'''
# The `shape` attribute gives the DataFrame's dimensions as a
# (number of rows, number of columns) tuple.
df.shape
# Recorded output of the expression above.
'''
(2939, 16)
'''
# Inspect the row labels (index) and the column labels.
# Row labels first; the recorded output abbreviates the 2939 labels with "...".
df.index
'''
Index(['AAACAATCTACTAGCA-1', 'AAACACCAATAACTGC-1', 'AAACAGAGCGACTCCT-1',
'AAACAGCTTTCAGAAG-1', 'AAACAGGGTCTATATT-1', 'AAACCCGAACGAAATC-1',
'AAACCGGGTAGGTACC-1', 'AAACCGTTCGTCCAGG-1', 'AAACCTCATGAAGTTG-1',
'AAACGAAGAACATACC-1',
...
'TTGTGGTAGGAGGGAT-1', 'TTGTGTATGCCACCAA-1', 'TTGTGTTTCCCGAAAG-1',
'TTGTTAGCAAATTCGA-1', 'TTGTTCAGTGTGCTAC-1', 'TTGTTCTAGATACGCT-1',
'TTGTTGTGTGTCAAGA-1', 'TTGTTTCATTAGTCTA-1', 'TTGTTTCCATACAACT-1',
'TTGTTTGTGTAAATTC-1'],
dtype='object', length=2939)
'''
# Column labels of the DataFrame.
df.columns
'''
Index(['in_tissue', 'array_row', 'array_col', 'n_genes_by_counts',
'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes',
'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts',
'clusters'],
dtype='object')
'''
|