Add 'repos/5a26850fd18725fc17d698fd6d09f1ba/' from commit 'ba5ec1f9b4d488155f738e12e34a39e20bd06968'
git-subtree-dir: repos/5a26850fd18725fc17d698fd6d09f1ba
git-subtree-mainline: ed3dcc754f
git-subtree-split: ba5ec1f9b4
repos/5a26850fd18725fc17d698fd6d09f1ba/scrape_aliexpress.py (new file, 108 lines)
@@ -0,0 +1,108 @@
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
#import selenium.webdriver.firefox.webdriver as fwb
import selenium.webdriver.chrome as cwd
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

import time
import os
import csv
from urllib import request
from urllib.error import HTTPError

home = 'https://www.aliexpress.com/'

# Column order for the CSV export; scrape_orders_page() yields dicts with these keys.
ATTRS = ['time', 'id', 'image', 'store', 'status', 'title', 'price', 'quantity']


# ff_bin = fwb.FirefoxBinary(firefox_path='/usr/bin/firefox')
# ff_profile = fwb.FirefoxProfile()
# Chromium with an English UI and a persistent profile directory, so the
# AliExpress session can survive between runs.
options = cwd.options.Options()
options.binary_location = '/usr/bin/chromium'
options.add_argument('--lang=en')
options.add_argument('--user-data-dir=/tmp/aliexpress')
driver = webdriver.Chrome(chrome_options=options)
# driver.set_window_size(1024, 768)

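# Note: this file is written against the Selenium 3.x API. The `chrome_options=`
# keyword and the `find_element_by_*` helpers used below were deprecated and
# later removed in Selenium 4; an equivalent setup there would look roughly
# like this (untested sketch, reusing the `options` object defined above):
#
#   driver = webdriver.Chrome(options=options)
#   driver.find_element(By.LINK_TEXT, 'Sign in')
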
def login():
    # Open the home page and switch to the English global site if that banner
    # is shown.
    driver.get(home)
    try:
        lb = driver.find_element_by_link_text('Go to Global Site (English)')
        lb.click()
    except Exception:
        print('Already in English')
    wait = WebDriverWait(driver, 10)
    login = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Sign in')))
    login.click()
    try:
        # Wait up to 30 s for the 'account-name' element, which the rest of
        # the script treats as proof of a logged-in session.
        element = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, 'account-name')))
    except Exception as ex:
        print('Login not detected (account-name element not found)')
        print(ex)
        driver.quit()
        exit()


def gotoorders():
    # Open the account menu and follow the 'My Orders' link.
    wait = WebDriverWait(driver, 10)
    element = driver.find_element_by_class_name('account-name')
    element.click()
    orders = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'My Orders')))
    orders.click()
    # Wait for the order-history page to finish loading (it shows a
    # 'My Orders' link of its own).
    orders = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'My Orders')))


def scrape_orders_page():
    # Each <tbody> of the order table is one order; an order may contain
    # several products, and one row is yielded per product.
    ordertable = driver.find_element_by_id('buyer-ordertable')
    for order in ordertable.find_elements_by_tag_name('tbody'):
        # head = order.find_element_by_class_name('order-head')
        # body = order.find_element_by_class_name('order-body')
        title = order.find_element_by_class_name('product-title').text
        status = order.find_element_by_class_name('order-status').text
        time = order.find_element_by_xpath(".//span[text()[contains(.,'Order time')]]/following-sibling::span[1]").text
        orderid = order.find_element_by_xpath(".//span[text()[contains(.,'Order ID')]]/following-sibling::span[1]").text
        store = order.find_element_by_xpath(".//span[text()[contains(.,'Store name')]]/following-sibling::span[1]").text
        for product in order.find_elements_by_class_name('product-sets'):
            pamount = product.find_element_by_class_name('product-amount').find_elements_by_tag_name('span')
            price, quantity = (elem.text for elem in pamount[:2])
            # Drop the size suffix after the last '_' to get the full-size
            # image URL.
            image = product.find_element_by_xpath(".//div[@class='product-left']/a/img").get_attribute('src').rsplit('_', 1)[0]

            # Download the product image once, into the current directory.
            imagefile = image.split('/')[-1]
            if not os.path.exists(imagefile):
                try:
                    request.urlretrieve(image, imagefile)
                except HTTPError:
                    print("Couldn't download image:", image)
            yield {'time': time,
                   'id': orderid,
                   'image': image,
                   'store': store,
                   'status': status,
                   'title': title,
                   'price': price,
                   'quantity': quantity}


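# The dicts yielded above use exactly the keys listed in ATTRS, so they can be
# written as-is by csv.DictWriter at the bottom of the script. A minimal way to
# try the scraper on a single page (hypothetical snippet, assuming login() and
# gotoorders() have already run):
#
#   for row in scrape_orders_page():
#       print(row['id'], row['title'])
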
def orders():
    # Generator over order-history pages: yields the driver once per page,
    # starting with the first one.
    gotoorders()
    yield driver
    yield from next_orders_page()


def next_orders_page():
    # Follow the 'Next' pagination link recursively; stop quietly once the
    # last page has been reached and there is no 'Next' link left.
    try:
        lb = driver.find_element_by_link_text('Next')
    except NoSuchElementException:
        return
    lb.click()
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'buyer-ordertable')))
    yield driver
    yield from next_orders_page()


login()
from itertools import islice
with open('list.csv', 'w') as f:
    fw = csv.DictWriter(f, ATTRS)
    fw.writeheader()
    # islice(orders(), None) walks every page; replace None with a number to
    # limit how many pages get scraped (handy while testing).
    for i in islice(orders(), None):
        for element in scrape_orders_page():
            fw.writerow(element)
# driver.save_screenshot('screen.png')
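# Rough usage (assumes Selenium 3.x, Chromium at /usr/bin/chromium and a
# matching chromedriver on the PATH):
#
#   python scrape_aliexpress.py
#
# The script opens AliExpress, clicks 'Sign in' and waits up to 30 s for a
# logged-in session (sign in manually in the window, or reuse the profile kept
# under /tmp/aliexpress). The order history is then written to list.csv and
# the product images are saved to the current directory.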