shapely是python中開源的針對(duì)空間幾何進(jìn)行處理的模塊,支持點(diǎn)、線、面等基本幾何對(duì)象類型以及相關(guān)空間操作。
curve→LineString和LinearRing類;
surface→Polygon類
集合方法分別對(duì)應(yīng)MultiPoint、MultiLineString、MultiPolygon
# 導(dǎo)入所需模塊 from shapely import geometry as geo from shapely import wkt from shapely import ops import numpy as np from shapely.geometry.polygon import LinearRing from shapely.geometry import Polygon from shapely.geometry import asPoint, asLineString, asMultiPoint, asPolygon
# 創(chuàng)建point pt1 = geo.Point([0,0]) coord = np.array([0,1]) pt2 = geo.Point(coord) pt3 = wkt.loads("POINT(1 1)") geo.GeometryCollection([pt1, pt2, pt3]) #批量可視化
最終三個(gè)點(diǎn)的結(jié)果如下所示:
# point常用屬性 print(pt1.x) #pt1的x坐標(biāo) print(pt1.y)#pt1的y坐標(biāo) print(list(pt1.coords)) print(np.array(pt1))
輸出結(jié)果如下:
0.0
0.0
[(0.0, 0.0)]
[0. 0.]
# point計(jì)算距離 d = pt2.distance(pt1) #計(jì)算pt1與pt2的距離, d =1.0
創(chuàng)建LineString主要有以下三種方法:
# LineString的創(chuàng)建 line1 = geo.LineString([(0,0),(1,-0.1),(2,0.1),(3,-0.1),(5,0.1),(7,0)]) arr = np.array([(2, 2), (3, 2), (4, 3)]) line2 = geo.LineString(arr) line3 = wkt.loads("LineString(-2 -2,4 4)")
line1, line2, line3對(duì)應(yīng)的直線如下所示
LineString常用方法:
print(line2.length) #計(jì)算線段長度:2.414213562373095 print(list(line2.coords)) #線段中點(diǎn)的坐標(biāo):[(2.0, 2.0), (3.0, 2.0), (4.0, 3.0)] print(np.array(line2)) #將點(diǎn)坐標(biāo)轉(zhuǎn)成numpy.array形式[[2. 2.],[3. 2.],[4. 3.]] print(line2.bounds)#坐標(biāo)范圍:(2.0, 2.0, 4.0, 3.0) center = line2.centroid #幾何中心: geo.GeometryCollection([line2, center]) bbox = line2.envelope #最小外接矩形 geo.GeometryCollection([line2, bbox]) rect = line2.minimum_rotated_rectangle #最小旋轉(zhuǎn)外接矩形 geo.GeometryCollection([line2, rect])
line2幾何中心:
line2的最小外接矩形:
line2的最小旋轉(zhuǎn)外接矩形:
#常用方法 d1 = line1.distance(line2) #線線距離: 1.9 d2 = line1.distance(geo.Point([-1, 0])) #點(diǎn)線距離:1.0 d3 = line1.hausdorff_distance(line2) #最大最小距離:4.242640687119285 #插值 pt_half = line1.interpolate(0.5, normalized = True) geo.GeometryCollection([line1,pt_half]) #投影 ratio = line1.project(pt_half, normalized = True) print(ratio)#project()方法是和interpolate方法互逆的:0.5
插值:
DouglasPucker算法:道格拉斯-普克算法:是將曲線近似表示為一系列點(diǎn),并減少點(diǎn)的數(shù)量的一種算法。
#DouglasPucker算法 line1 = geo.LineString([(0, 0), (1, -0.2), (2, 0.3), (3, -0.5), (5, 0.2), (7,0)]) line1_simplify = line1.simplify(0.4, preserve_topology=False) print(line1)#LINESTRING (0 0, 1 -0.1, 2 0.1, 3 -0.1, 5 0.1, 7 0) print(line1_simplify)#LINESTRING (0 0, 2 0.3, 3 -0.5, 5 0.2, 7 0) buffer_with_circle = line1.buffer(0.2) #端點(diǎn)按照半圓擴(kuò)展 geo.GeometryCollection([line1,buffer_with_circle])
道格拉斯-普克算法化簡(jiǎn)后的結(jié)果
#LinearRing是一個(gè)封閉圖形 ring = LinearRing([(0, 0), (1, 1), (1, 0)]) print(ring.length)#相比于剛才的LineString的代碼示例,其長度現(xiàn)在是3.41,是因?yàn)槠湫蛄惺情]合的 print(ring.area):結(jié)果為0 geo.GeometryCollection([ring])
polygonl = Polygon([(0, 0), (1, 1), (1, 0)]) ext = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)] int1 = [(1, 0), (0.5, 0.5), (1, 1), (1.5, 0.5), (1, 0)] polygon2 = Polygon(ext, [int1]) print(polygonl.area)#幾何對(duì)象的面積:0.5 print(polygonl.length)#幾何對(duì)象的周長:3.414213562373095 print(polygon2.area)#其面積是ext的面積減去int的面積:3.5 print(polygon2.length)#其長度是ext的長度加上int的長度:10.82842712474619 print(np.array(polygon2.exterior)) #外圍坐標(biāo)點(diǎn): #[[0. 0.] #[0. 2.] #[2. 2.] #[2. 0.] # [0. 0.]] geo.GeometryCollection([polygon2])
#obj.contains(other) == other.within(obj) coords = [(0, 0), (1, 1)] print(geo.LineString(coords).contains(geo.Point(0.5, 0.5)))#包含:True print(geo.LineString(coords).contains(geo.Point(1, 1)))#False polygon1 = Polygon([(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]) print(polygon1.contains(geo.LineString([(1.0, 1.0), (1.0, 0)])))#面與線關(guān)系:True #contains方法也可以擴(kuò)展到面與線的關(guān)系以及面與面的關(guān)系 geo.GeometryCollection([polygon1, geo.LineString([(1.0, 1.0), (1.0, 0)])])
#obj.crosses(other):相交與否 print(geo.LineString(coords).crosses(geo.LineString([(0, 1), (1, 0)])))#:True geo.GeometryCollection([geo.LineString(coords), geo.LineString([(0, 1), (1, 0)])]) #obj.disjoint(other):均不相交返回True print(geo.Point(0, 0).disjoint(geo.Point(1, 1))) #object.intersects(other)如果該幾何對(duì)象與另一個(gè)幾何對(duì)象只要相交則返回True。 print(geo.LineString(coords).intersects(geo.LineString([(0, 1), (1, 0)])))#True
#object.convex_hull返回包含對(duì)象中所有點(diǎn)的最小凸多邊形(凸包) points1 = geo.MultiPoint([(0, 0), (1, 1), (0, 2), (2, 2), (3, 1), (1, 0)]) hull1 = points1.convex_hull geo.GeometryCollection([hull1, points1])
#object.intersection 返回對(duì)象與對(duì)象之間的交集 polygon1 = Polygon([(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]) hull1.intersection(polygon1)
#返回對(duì)象與對(duì)象之間的并集 hull1.union(polygon1)
#面面補(bǔ)集 hull1.difference(polygon1)
pa = asPoint(np.array([0, 0])) #將numpy轉(zhuǎn)成point格式
#將numpy數(shù)組轉(zhuǎn)成LineString格式 la = asLineString(np.array(([[1.0, 2.0], [3.0, 4.0]])))
#將numpy數(shù)組轉(zhuǎn)成multipoint集合 ma = asMultiPoint(np.array([[1.1, 2.2], [3.3, 4.4], [5.5, 6.6]]))
#將numpy轉(zhuǎn)成多邊形 pg = asPolygon(np.array([[1.1, 2.2], [3.3, 4.4], [5.5, 6.6]]))
geopandas拓展了pandas,共有兩種數(shù)據(jù)類型:GeoSeries、GeoDataFrame
下述是利用geopandas庫繪制世界地圖:
import pandas as pd import geopandas import matplotlib.pyplot as plt world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) #read_file方法可以讀取shape文件 world.plot() plt.show()
world.head()
#根據(jù)每一個(gè)polygon的pop_est不同,便可以用python繪制圖表顯示不同國家的人數(shù) fig, ax = plt.subplots(figsize = (9, 6), dpi = 100) world.plot('pop_est', ax = ax, legend =True) plt.show()
python對(duì)海洋數(shù)據(jù)進(jìn)行預(yù)處理操作(這里我發(fā)現(xiàn),tqdm模塊可以顯示進(jìn)度條,感覺很高端,像下面這樣)
```python import pandas as pd import geopandas as gpd from pyproj import Proj #左邊轉(zhuǎn)換 from keplergl import KeplerGl from tqdm import tqdm import os import matplotlib.pyplot as plt from matplotlib.lines import Line2D import shapely import numpy as np from datetime import datetime import warnings warnings.filterwarnings('ignore') plt.rcParams['font.sans-serif'] = ['SimSun'] #指定默認(rèn)字體為新宋體 plt.rcParams['axes.unicode_minus'] = False
DataFrame獲取數(shù)據(jù),坐標(biāo)轉(zhuǎn)換,計(jì)算距離
#獲取文件夾中的數(shù)據(jù) def get_data(file_path, model): assert model in ['train', 'test'], '{} Not Support this type of file'.format(model) paths = os.listdir(file_path) tmp = [] for t in tqdm(range(len(paths))): p = paths[t] with open('{}/{}'.format(file_path, p), encoding = 'utf-8') as f: next(f) #讀取下一行 for line in f.readlines(): tmp.append(line.strip().split(',')) tmp_df = pd.DataFrame(tmp) if model == 'train': tmp_df.columns = ['ID', 'lat', 'lon', 'speed', 'direction', 'time', 'type'] else: tmp_df['type'] = 'unknown' tmp_df.columns = ['ID', 'lat', 'lon', 'speed', 'direction', 'time', 'type'] tmp_df['lat'] = tmp_df['lat'].astype(float) tmp_df['lon'] = tmp_df['lon'].astype(float) tmp_df['speed'] = tmp_df['speed'].astype(float) tmp_df['direction'] = tmp_df['direction'].astype(int) return tmp_df file_path = r"C:\Users\李\Desktop\datawheal\數(shù)據(jù)\hy_round1_train_20200102" model = 'train' #平面坐標(biāo)轉(zhuǎn)經(jīng)緯度 def transform_xy2lonlat(df): x = df['lat'].values y = df['lon'].values p = Proj('+proj=lcc +lat_1=33.88333333333333 +lat_2=32.78333333333333 +lat_0=32.16666666666666 +lon_0=-116.25 +x_0=2000000.0001016 +y_0=500000.0001016001 +datum=NAD83 +units=us-ft +no_defs ') df['lon'], df['lat'] = p(y, x, inverse = True) return df #修改數(shù)據(jù)的時(shí)間格式 def reformat_strtime(time_str = None, START_YEAR = '2019'): time_str_split = time_str.split(" ") #以空格為分隔符 time_str_reformat = START_YEAR + '-' + time_str_split[0][:2] + "-" + time_str_split[0][2:4] time_str_reformat = time_str_reformat + " " + time_str_split[1] return time_str_reformat #計(jì)算兩個(gè)點(diǎn)的距離 def haversine_np(lon1, lat1, lon2, lat2): lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2]) dlon = lon2 - lon1 dlat = lat2 - lat1 a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2 c = 2 * np.arcsin(np.sqrt(a)) km = 6367 * c return km * 1000
利用3-sigma算法對(duì)異常值進(jìn)行處理,速度與時(shí)間
#計(jì)算時(shí)間的差值 def compute_traj_diff_time_distance(traj = None): #計(jì)算時(shí)間的差值 time_diff_array = (traj['time'].iloc[1:].reset_index(drop = True) - traj['time'].iloc[:-1].reset_index(drop = True)).dt.total_seconds() / 60 #計(jì)算坐標(biāo)之間的距離 dist_diff_array = haversine_np(traj['lon'].values[1:], traj['lat'].values[1:], traj['lon'].values[:-1], traj['lat'].values[:-1]) #填充第一個(gè)值 time_diff_array = [time_diff_array.mean()] + time_diff_array.tolist() dist_diff_array = [dist_diff_array.mean()] + dist_diff_array.tolist() traj.loc[list(traj.index), 'time_array'] = time_diff_array traj.loc[list(traj.index), 'dist_array'] = dist_diff_array return traj #對(duì)軌跡進(jìn)行異常點(diǎn)的剔除 def assign_traj_anomaly_points_nan(traj = None, speed_maximum = 23,time_interval_maximum = 200, coord_speed_maximum = 700): #將traj中的異常點(diǎn)分配給np.nan def thigma_data(data_y, n): data_x = [i for i in range(len(data_y))] ymean = np.mean(data_y) ystd = np.std(data_y) threshold1 = ymean - n * ystd threshold2 = ymean + n * ystd judge = [] for data in data_y: if data threshold1 or data > threshold2: judge.append(True) else: judge.append(False) return judge #異常速度修改 is_speed_anomaly = (traj['speed'] > speed_maximum) | (traj['speed'] 0) traj['speed'][is_speed_anomaly] = np.nan #根據(jù)距離和時(shí)間計(jì)算速度 is_anomaly = np.array([False] * len(traj)) traj['coord_speed'] = traj['dist_array'] / traj['time_array'] #根據(jù)3-sigma算法對(duì)速度剔除以及較大的時(shí)間間隔點(diǎn) is_anomaly_tmp = pd.Series(thigma_data(traj['time_array'], 3)) | pd.Series(thigma_data(traj['coord_speed'], 3)) is_anomaly = is_anomaly | is_anomaly_tmp is_anomaly.index = traj.index #軌跡點(diǎn)的3-sigma異常處理 traj = traj[~is_anomaly].reset_index(drop = True) is_anomaly = np.array([False]*len(traj)) if len(traj) != 0: lon_std, lon_mean = traj['lon'].std(), traj['lon'].mean() lat_std, lat_mean = traj['lat'].std(), traj['lat'].mean() lon_low, lon_high = lon_mean - 3* lon_std, lon_mean + 3 * lon_std lat_low, lat_high = lat_mean - 3 * lat_std, lat_mean + 3 * lat_std is_anomaly = is_anomaly | (traj['lon'] > lon_high) | ((traj['lon'] lon_low)) is_anomaly = is_anomaly | (traj["lat"] > lat_high) | ((traj["lat"] lat_low)) traj = traj[~is_anomaly].reset_index(drop = True) return traj, [len(is_speed_anomaly) - len(traj)] file_path = r"C:\Users\李\Desktop\datawheal\數(shù)據(jù)\hy_round1_train_20200102" model = 'train' df = get_data(file_path, model) #轉(zhuǎn)換時(shí)間格式 df = transform_xy2lonlat(df) df['time'] = df['time'].apply(reformat_strtime) df['time'] = df['time'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S')) #對(duì)軌跡的異常點(diǎn)進(jìn)行剔除,對(duì)缺失值進(jìn)行線性插值處理 ID_list = list(pd.DataFrame(df['ID'].value_counts()).index) DF_NEW = [] Anomaly_count = [] for ID in tqdm(ID_list): # print(ID) df_id = compute_traj_diff_time_distance(df[df['ID'] == ID]) df_new, count = assign_traj_anomaly_points_nan(df_id) df_new['speed'] = df_new['speed'].interpolate(method = 'linear', axis = 0) df_new = df_new.fillna(method = 'bfill') #用前一個(gè)非缺失值取填充該缺失值 df_new = df_new.fillna(method = 'ffill')#用后一個(gè)非缺失值取填充該缺失值 df_new['speed'] = df_new['speed'].clip(0, 23) #clip()函數(shù)將其限定在0,23 Anomaly_count.append(count) #統(tǒng)計(jì)每個(gè)id異常點(diǎn)的數(shù)量有多少 DF_NEW.append(df_new) DF = pd.concat(DF_NEW)
處理后的DF
利用Geopandas中的Simplify進(jìn)行軌跡簡(jiǎn)化和壓縮
#道格拉斯-普克,由該案例可以看出針對(duì)相同的ID軌跡,可以先用geopandas將其進(jìn)行簡(jiǎn)化和數(shù)據(jù)壓縮 line = shapely.geometry.LineString(np.array(df[df['ID'] == '11'][['lon', 'lat']])) ax = gpd.GeoSeries([line]).plot(color = 'red') ax = gpd.GeoSeries([line]).simplify(tolerance = 0.000000001).plot(color = 'blue', ax = ax, linestyle = '--') LegendElement = [Line2D([], [], color = 'red', label = '簡(jiǎn)化前'), Line2D([], [], color = 'blue', linestyle = '--', label = '簡(jiǎn)化后')] #將制作好的圖例影響對(duì)象列表導(dǎo)入legend()中 ax.legend(handles = LegendElement, loc = 'upper left', fontsize = 10) print('化簡(jiǎn)前數(shù)據(jù)長度:' + str(len(np.array(gpd.GeoSeries([line])[0])))) print('化簡(jiǎn)后數(shù)據(jù)長度' + str(len(np.array(gpd.GeoSeries([line]).simplify(tolerance = 0.000000001)[0])))) #定義數(shù)據(jù)簡(jiǎn)化函數(shù),通過shapely庫將經(jīng)緯度轉(zhuǎn)換成LineString格式,然后通過GeoSeries數(shù)據(jù)結(jié)構(gòu)中利用simplify進(jìn)行簡(jiǎn)化,再將所有數(shù)據(jù)放入GeoDataFrame def simplify_dataframe(df): line_list = [] for i in tqdm(dict(list(df.groupby('ID')))): line_dict = {} lat_lon = dict(list(df.groupby('ID')))[i][['lon', 'lat']] line = shapely.geometry.LineString(np.array(lat_lon)) line_dict['ID'] = dict(list(df.groupby('ID')))[i].iloc[0]['ID'] line_dict['type'] = dict(list(df.groupby('ID')))[i].iloc[0]['type'] line_dict['geometry'] = gpd.GeoSeries([line]).simplify(tolerance = 0.000000001)[0] line_list.append(line_dict) return gpd.GeoDataframe(line_list)
化簡(jiǎn)前數(shù)據(jù)長度:377
化簡(jiǎn)后數(shù)據(jù)長度156
這塊的df_gpd_change沒有讀出來,后續(xù)再發(fā)
df_gpd_change=pd.read_pickle(r"C:\Users\李\Desktop\datawheal\數(shù)據(jù)\df_gpd_change.pkl") map1=KeplerGl(height=800)#zoom_start與這個(gè)height類似,表示地圖的縮放程度 map1.add_data(data=df_gpd_change,name='data') #當(dāng)運(yùn)行該代碼后,下面會(huì)有一個(gè)kepler.gl使用說明的鏈接,可以根據(jù)該鏈接進(jìn)行學(xué)習(xí)參
GeoHash編碼:利用二分法不斷縮小經(jīng)緯度區(qū)間,經(jīng)度區(qū)間二分為[-180, 0]和[0,180],緯度區(qū)間二分為[-90,0]和[0,90],偶數(shù)位放經(jīng)度,奇數(shù)位放緯度交叉,將二進(jìn)制數(shù)每五位轉(zhuǎn)化為十進(jìn)制,在對(duì)應(yīng)編碼表進(jìn)行32位編碼
def geohash_encode(latitude, longitude, precision = 12): lat_interval, lon_interval = (-90.0, 90.0), (-180, 180) base32 = '0123456789bcdefghjkmnpqrstuvwxyz' geohash = [] bits = [16, 8, 4, 2, 1] bit = 0 ch = 0 even = True while len(geohash) precision: if even: mid = (lon_interval[0] + lon_interval[1]) / 2 if longitude > mid: ch |= bits[bit] lon_interval = (mid, lon_interval[1]) else: lon_interval = (lon_interval[0], mid) else: mid = (lat_interval[0] + lat_interval[1]) / 2 if latitude > mid: ch |= bits[bit] lat_interval = (mid, lat_interval[1]) else: lat_interval = (lat_interval[0], mid) even = not even if bit 4: bit += 1 else: geohash += base32[ch] bit = 0 ch = 0 return ''.join(geohash)
到此這篇關(guān)于python爬蟲之地理數(shù)據(jù)分析的文章就介紹到這了,更多相關(guān)python地理數(shù)據(jù)內(nèi)容請(qǐng)搜索腳本之家以前的文章或繼續(xù)瀏覽下面的相關(guān)文章希望大家以后多多支持腳本之家!
標(biāo)簽:日照 金華 貴州 雙鴨山 臨汾 陽泉 克拉瑪依 赤峰
巨人網(wǎng)絡(luò)通訊聲明:本文標(biāo)題《python爬蟲之教你如何爬取地理數(shù)據(jù)》,本文關(guān)鍵詞 python,爬蟲,之教,你,如何,;如發(fā)現(xiàn)本文內(nèi)容存在版權(quán)問題,煩請(qǐng)?zhí)峁┫嚓P(guān)信息告之我們,我們將及時(shí)溝通與處理。本站內(nèi)容系統(tǒng)采集于網(wǎng)絡(luò),涉及言論、版權(quán)與本站無關(guān)。