2019独角兽企业重金招聘Python工程师标准>>>
### Method 1: read an HDFS file with the `hdfs` library.
### Pass encoding='utf-8' when reading; otherwise the strings come back with a
### b'xxx' prefix.
### The file is read line by line, each record is split into columns, the
### result is loaded into a DataFrame, and finally the columns are cast to the
### required dtype.
from hdfs.client import Client

import pandas as pd

client = Client("http://hadoop-1-1:50070")
with client.read("/user/spark/H2O/Wholesale_customers_data.csv", encoding='utf-8') as reader:
    # Strip surrounding whitespace and skip blank lines (for example a
    # trailing empty line at the end of the file): a blank line would become
    # a row of empty strings and make the integer cast below fail.
    lines = [stripped for stripped in (line.strip() for line in reader) if stripped]

if not lines:
    # Without a header line there are no column names to build the frame from.
    raise ValueError("HDFS file is empty: expected a CSV header line")

# The first line is the header; every remaining line is one record.
column_list = lines[0].split(',')

# Split each record once and build the frame directly. This replaces the
# earlier per-row `pd.Series` construction inside `apply`, which created one
# Series object per record and needed a temporary "item_list" column.
df = pd.DataFrame([row.split(',') for row in lines[1:]], columns=column_list)

# Bare expression: echoes the dtypes when run in an interactive session.
# Output recorded by the original author (every column is still a string):
#   Region              object
#   Fresh               object
#   Milk                object
#   Grocery             object
#   Frozen              object
#   Detergents_Paper    object
#   Delicassen          object
#   target              object
#   dtype: object
df.dtypes

df = df.astype('int')  ## convert the object columns to int64

# Output recorded by the original author after the cast:
#   Region              int64
#   Fresh               int64
#   Milk                int64
#   Grocery             int64
#   Frozen              int64
#   Detergents_Paper    int64
#   Delicassen          int64
#   target              int64
#   dtype: object
df.dtypes
### Method 2: read an HDFS file with the pydoop library.
import pandas as pd
import pydoop.hdfs as hdfs

# Column names are supplied by hand rather than taken from the file.
column_list = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']
# The four measurement columns; 'Species' (the last name) stays a string.
numeric_columns = column_list[:-1]

with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    # Strip surrounding whitespace and skip blank lines, which would
    # otherwise become rows of empty strings and break the float cast below.
    lines = [stripped for stripped in (line.strip() for line in f) if stripped]

rows = [line.split(",") for line in lines]
# If the file starts with a header row identical to the names above, drop it
# so it is not loaded as a data record (a header row cannot be cast to float).
# Files without a header are unaffected.
if rows and rows[0] == column_list:
    rows = rows[1:]

# Build the frame directly from the split records instead of constructing a
# pd.Series per row through `apply` on a temporary "item_list" column.
df = pd.DataFrame(rows, columns=column_list)

## Adjust the dtypes: cast the measurement columns to float64.
df[numeric_columns] = df[numeric_columns].astype('float64')

# Bare expression: echoes the dtypes when run in an interactive session.
# Output recorded by the original author:
#   Sepal_Length    float64
#   Sepal_Width     float64
#   Petal_Length    float64
#   Petal_Width     float64
#   Species          object
#   dtype: object
df.dtypes
### Read the data directly with the pandas parser.
import pandas as pd
import pydoop.hdfs as hdfs

### This data file contains a header row.
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    # pd.read_table defaults to a tab separator, so on this comma-separated
    # file it returned a single column holding each whole record, which then
    # had to be split by hand. pd.read_csv splits on commas and takes the
    # column names from the header row in one step.
    df = pd.read_csv(f)

# Column names taken from the header row, as a plain list of strings.
column_list = list(df.columns)

# Bare expression: echoes the first rows when run in an interactive session.
# Expected shape of the output, based on the rows recorded by the original
# author:
#      Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Species
#   0           5.1          3.5           1.4          0.2  setosa
#   1           4.9          3.0           1.4          0.2  setosa
#   2           4.7          3.2           1.3          0.2  setosa
#   3           4.6          3.1           1.5          0.2  setosa
#   4           5.0          3.6           1.4          0.2  setosa
df.head()

##### Make sure 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'
##### are float columns. The explicit cast is kept because the parser infers
##### an integer dtype for any column whose values are all whole numbers.
float_columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
df[float_columns] = df[float_columns].astype('float')

# Output recorded by the original author after the cast:
#   Sepal_Length    float64
#   Sepal_Width     float64
#   Petal_Length    float64
#   Petal_Width     float64
#   Species          object
#   dtype: object
df.dtypes
转载于:https://my.oschina.net/kyo4321/blog/3016864
更多推荐
python读取HDFS文件
发布评论