本文作者: 小森同學
阿里雲Elasticsearch客戶真實實踐分享
文中涉及到的圖片特徵提取,使用了 yongyuan.name 的 VGGNet 庫,在此表示感謝!
“圖片搜索”是導購類網站比較常見的一種功能,其實現方式有很多,比如“哈希指紋+漢明距離計算”、“特徵向量+milvus”,但在實際的應用場景中,要同時做到快速、精準、簡單是比較困難的事情。
“圖片搜索”方式優缺點對比
方案三查詢效果:
四步搭建“以圖搜圖”搜索引擎
以下是基於 阿里雲 Elasticsearch 6.7 版本,通過安裝阿里雲 Elasticsearch 向量檢索插件【aliyun-knn】 實現,且設計圖片向量特徵為512維度。
如果自建 Elasticsearch ,是無法使用aliyun-knn插件的,自建建議使用開源 Elasticsearch 7.x版本,並安裝fast-elasticsearch-vector-scoring插件(https://github.com/lior-k/fast-elasticsearch-vector-scoring/)
一、 Elasticsearch 索引設計
1.1、索引結構
1. # 創建一個圖片索引
2. PUT images_v2
3. {
4. "aliases": {
5. "images": {}
6. },
7. "settings": {
8. "index.codec": "proxima",
9. "index.vector.algorithm": "hnsw",
10. "index.number_of_replicas":1,
11. "index.number_of_shards":3
12. },
13. "mappings": {
14. "_doc": {
15. "properties": {
16. "feature": {
17. "type": "proxima_vector",
18. "dim": 512
19. },
20. "relation_id": {
21. "type": "keyword"
22. },
23. "image_path": {
24. "type": "keyword"
25. }
26. }
27. }
28. }
29. }
1.2、DSL 語句
1. GET images/_search
2. {
3. "query": {
4. "hnsw": {
5. "feature": {
6. "vector": [255,....255],
7. "size": 3,
8. "ef": 1
9. }
10. }
11. },
12. "from": 0,
13. "size": 20,
14. "sort": [
15. {
16. "_score": {
17. "order": "desc"
18. }
19. }
20. ],
21. "collapse": {
22. "field": "relation_id"
23. },
24. "_source": {
25. "includes": [
26. "relation_id",
27. "image_path"
28. ]
29. }
30. }
二、 圖片特徵
extract_cnn_vgg16_keras.py
1. # -*- coding: utf-8 -*-
2. # Author: yongyuan.name
3. import numpy as np
4. from numpy import linalg as LA
5. from keras.applications.vgg16 import VGG16
6. from keras.preprocessing import image
7. from keras.applications.vgg16 import preprocess_input
8. from PIL import Image, ImageFile
9. ImageFile.LOAD_TRUNCATED_IMAGES = True
class VGGNet:
    """Image feature extractor built on the Keras VGG16 network.

    The network is created without its classification head
    (``include_top=False``) and with global pooling, so ``predict`` returns
    one pooled feature vector per input image.
    """

    def __init__(self):
        # weights: 'imagenet'
        # pooling: 'max' or 'avg'
        # input_shape: (width, height, 3), width and height should >= 48
        self.input_shape = (224, 224, 3)
        self.weight = 'imagenet'
        self.pooling = 'max'
        self.model = VGG16(
            weights=self.weight,
            input_shape=self.input_shape,
            pooling=self.pooling,
            include_top=False,
        )
        # Run one prediction on an all-zero batch shaped from input_shape
        # (presumably a warm-up so the first real call is not the one that
        # builds the prediction function).
        self.model.predict(np.zeros((1,) + self.input_shape))

    def extract_feat(self, img_path):
        """Use the VGG16 model to extract a normalized feature vector.

        :param img_path: path of the image file to load
        :return: 1-D numpy array, the pooled VGG16 output scaled to unit
            L2 norm; an all-zero output is returned unchanged instead of
            being divided by zero
        """
        img = image.load_img(img_path, target_size=self.input_shape[:2])
        img = image.img_to_array(img)
        img = np.expand_dims(img, axis=0)  # add the batch dimension
        img = preprocess_input(img)
        feat = self.model.predict(img)[0]
        norm = LA.norm(feat)
        if norm == 0:
            # Dividing by a zero norm would fill the vector with NaNs.
            return feat
        return feat / norm
# Compute the feature vector of a single image.
from extract_cnn_vgg16_keras import VGGNet

file_path = "./demo.jpg"
model = VGGNet()
queryVec = model.extract_feat(file_path)
# Turn the numpy vector into a plain list so it can be serialized.
feature = queryVec.tolist()
三、 圖片特徵寫入阿里雲 Elasticsearch
helper.py
1. import re
2. import urllib.request
def strip(path):
    r"""Remove characters that are not allowed in Windows file/folder names.

    The characters ``\ / : * ? " < > |`` are removed, together with the
    typographic opening quote ``“`` that the original pattern listed.

    :param path: name to clean; converted with ``str()`` before cleaning
    :return: the cleaned name as a string
    """
    path = re.sub(r'[?\\*|"“<>:/]', '', str(path))
    return path
12.
def getfilename(url):
    """Return the cleaned file name taken from the end of *url*.

    :param url: URL (or any ``/``-separated string)
    :return: the text after the last ``/``, passed through ``strip``
    """
    last_segment = url.rsplit('/', 1)[-1]
    return strip(last_segment)
22.
def urllib_download(url, filename):
    """Download *url* and store the content at *filename*.

    :param url: address of the resource to fetch
    :param filename: local path the content is written to
    :return: the result of ``urllib.request.urlretrieve``
    """
    result = urllib.request.urlretrieve(url, filename)
    return result
train.py
1. # coding=utf-8
2. import mysql.connector
3. import os
4. from helper import urllib_download, getfilename
5. from elasticsearch5 import Elasticsearch, helpers
6. from extract_cnn_vgg16_keras import VGGNet
# Feature extractor used for every image handled by this script.
model = VGGNet()

# Elasticsearch client with HTTP basic authentication.
# NOTE(review): credentials are hard-coded demo values.
http_auth = ("elastic", "123455")
es = Elasticsearch("http://127.0.0.1:9200", http_auth=http_auth)

# MySQL connection to the database that holds the image records.
mysql_settings = {
    "host": "127.0.0.1",   # database host address
    "user": "root",        # database user name
    "passwd": "123456",    # database password
    "database": "images",
}
mydb = mysql.connector.connect(**mysql_settings)
mycursor = mydb.cursor()

# Directory prefix for locally stored copies of the downloaded images.
imgae_path = "./images/"
def get_data(page=1, page_size=20):
    """Fetch one page of rows from the ``images`` table.

    :param page: 1-based page number
    :param page_size: rows per page (defaults to 20)
    :return: the rows returned by ``mycursor.fetchall()``, each holding
        ``id``, ``relation_id`` and ``photo``
    """
    offset = (page - 1) * page_size
    # ORDER BY makes the paging deterministic: without it MySQL gives no
    # guarantee about row order, so consecutive LIMIT windows may overlap
    # or skip rows. Values are passed as parameters, not formatted in.
    sql = """
    SELECT id, relation_id, photo FROM images ORDER BY id LIMIT %s, %s
    """
    mycursor.execute(sql, (offset, page_size))
    return mycursor.fetchall()
27.
def train_image_feature(myresult):
    """Extract features for a batch of image rows and index them.

    For every row the image is downloaded under ``imgae_path`` (unless a
    file with that name already exists), its feature vector is computed
    with ``model.extract_feat`` and an ``index`` action is queued. The
    queued actions are sent with ``helpers.streaming_bulk`` and the index
    is refreshed afterwards. Rows whose download or feature extraction
    fails are reported and skipped.

    :param myresult: iterable of rows ``(id, relation_id, photo)``
    :return: None
    """
    indexName = "images"
    photo_path = "http://域名/{0}"
    # The target directory has to exist, otherwise every download fails.
    os.makedirs(imgae_path, exist_ok=True)
    actions = []
    for row in myresult:
        doc_id = str(row[0])
        relation_id = row[1]
        # NOTE(review): decode here if the driver returns bytes for `photo`.
        photo = row[2]
        full_photo = photo_path.format(photo)
        filename = imgae_path + getfilename(full_photo)
        if not os.path.exists(filename):
            try:
                urllib_download(full_photo, filename)
            except Exception:
                # Report the row id and move on; one failed download must
                # not abort the whole batch.
                print("gid:{0}的圖片{1}未能下載成功".format(doc_id, full_photo))
                continue
        if not os.path.exists(filename):
            continue
        try:
            feature = model.extract_feat(filename).tolist()
        except Exception:
            print("id:{0}的圖片{1}未能獲取到特徵".format(doc_id, full_photo))
            continue
        actions.append({
            "_op_type": "index",
            "_index": indexName,
            "_type": "_doc",
            "_id": doc_id,
            "_source": {
                "relation_id": relation_id,
                "feature": feature,
                "image_path": photo,
            },
        })
    succeed_num = 0
    # raise_on_error=False lets failed items be yielded with ok == False so
    # they are printed below instead of raising on the first failure.
    for ok, response in helpers.streaming_bulk(es, actions, raise_on_error=False):
        if not ok:
            print(ok)
            print(response)
        else:
            succeed_num += 1
            print("本次更新了{0}條數據".format(succeed_num))
    es.indices.refresh(indexName)
74.
# Walk through the table page by page until a page comes back empty.
page = 0
while True:
    page += 1
    print("當前第{0}頁".format(page))
    myresult = get_data(page=page)
    if myresult:
        train_image_feature(myresult)
        continue
    print("沒有獲取到數據了,退出")
    break
四、 搜索圖片
1. import requests
2. import json
3. import os
4. import time
5. from elasticsearch5 import Elasticsearch
6. from extract_cnn_vgg16_keras import VGGNet
# Feature extractor and Elasticsearch client.
model = VGGNet()
http_auth = ("elastic", "123455")
es = Elasticsearch("http://127.0.0.1:9200", http_auth=http_auth)

# Save the uploaded image to a temporary file.
# NOTE(review): `request` is not defined in this listing; `request.files`,
# `.content_type` and `.save()` look like a web framework's upload API
# (presumably Flask) - confirm and import it where this code is used.
upload_image_path = "./runtime/"
upload_image = request.files.get("image")
upload_image_type = upload_image.content_type.split('/')[-1]
file_name = str(time.time())[:10] + '.' + upload_image_type
file_path = upload_image_path + file_name
upload_image.save(file_path)

# Compute the feature vector of the uploaded image.
try:
    queryVec = model.extract_feat(file_path)
    feature = queryVec.tolist()
finally:
    # Remove the temporary file even when feature extraction fails.
    os.remove(file_path)

# Search Elasticsearch with the feature vector.
body = {
    "query": {
        "hnsw": {
            "feature": {
                "vector": feature,
                "size": 5,
                "ef": 10
            }
        }
    },
    # Optional: enable to collapse the hits on relation_id.
    # "collapse": {
    #     "field": "relation_id"
    # },
    "_source": {"includes": ["relation_id", "image_path"]},
    "from": 0,
    "size": 40
}
indexName = "images"
res = es.search(indexName, body=body)
# It is advisable to filter out low-scoring hits to suit your own data; in the
# author's tests, hits scoring 0.65 or above matched the requirements well.
依賴的包
1. mysql_connector_repackaged
2. elasticsearch5
3. Pillow
4. tensorflow
5. requests
6. pandas
7. Keras
8. numpy
總結:
從“用戶體驗”角度考慮,在可感知層面,速度和精準度決定了產品在用戶使用過程中,是否滿足“好用”的感覺,通過阿里雲 Elasticsearch 向量檢索(aliyun-knn)簡單四步搭建的“以圖搜圖”搜索引擎,不僅滿足“好用”,同時操作簡單、一步到位的特點,也加分不少。
相關活動
更多折扣活動,請訪問阿里雲 Elasticsearch 官網
• 阿里雲 Elasticsearch 商業通用版,1核2G首月免費
• 阿里雲 Elasticsearch 日誌增強版,首月六折,年付六折
• 阿里雲 Logstash 2核4G首月免費
如有相關實踐的同學,歡迎關注我們公眾號投稿