1
0
mirror of https://github.com/balkian/gists.git synced 2024-11-25 02:32:28 +00:00
gists/repos/5a26850fd18725fc17d698fd6d09f1ba/scrape_aliexpress.py

109 lines
4.0 KiB
Python
Raw Normal View History

2017-02-28 11:40:12 +00:00
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
#import selenium.webdriver.firefox.webdriver as fwb
import selenium.webdriver.chrome as cwd
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import csv
from urllib import request
from urllib.error import HTTPError
# Entry page of the shop; login() starts its navigation from here.
home = 'https://www.aliexpress.com/'
# Column order of the CSV report. The names match the keys of the dicts
# yielded by scrape_orders_page().
ATTRS = ['time', 'id', 'image', 'store', 'status', 'title', 'price', 'quantity']
# Alternative Firefox setup, kept for reference:
# ff_bin = fwb.FirefoxBinary(firefox_path='/usr/bin/firefox')
# ff_profile = fwb.FirefoxProfile()
options = cwd.options.Options()
# Drive the system Chromium binary instead of whatever the driver would pick.
options.binary_location = '/usr/bin/chromium'
# The scraper locates elements by their English link texts and labels.
options.add_argument('--lang=en')
# Fixed profile directory; presumably so that cookies / the login session
# survive between runs -- TODO confirm.
options.add_argument('--user-data-dir=/tmp/aliexpress')
# NOTE(review): newer Selenium releases spell this keyword `options=`.
# `chrome_options=` is kept because the rest of the file uses the older
# find_element_by_* API -- confirm against the installed Selenium version.
driver = webdriver.Chrome(chrome_options=options)
# driver.set_window_size(1024, 768)
def login():
    """Open the shop's home page and wait until a user is signed in.

    Steps:
      1. Load ``home``.
      2. Click the 'Go to Global Site (English)' link when it can be found
         and clicked; otherwise assume the page is already in English.
      3. Wait up to 10 seconds for the 'Sign in' link and click it.
      4. Wait up to 30 seconds for an element with class 'account-name' to
         be present (presumably the user fills in the sign-in form by hand
         during this time -- TODO confirm).

    If step 4 fails, the error is printed, the browser is closed and the
    script terminates with a non-zero exit status.

    Raises:
        SystemExit: when the 'account-name' element never shows up.
    """
    # Imported locally: the file header only imports the parts of selenium
    # that the rest of the script uses.
    from selenium.common.exceptions import WebDriverException

    driver.get(home)
    try:
        driver.find_element_by_link_text('Go to Global Site (English)').click()
    except WebDriverException:
        # Best effort: a missing or unclickable link is treated as "nothing
        # to switch". Only selenium errors are swallowed, so Ctrl-C and
        # programming errors still surface.
        print('Already in English')

    sign_in = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.LINK_TEXT, 'Sign in')))
    sign_in.click()

    try:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'account-name')))
    except Exception as ex:
        print('My orders not found')
        print(ex)
        driver.quit()
        # Signal the failure to the calling shell instead of exiting with 0;
        # raising SystemExit also avoids relying on the site-provided exit().
        raise SystemExit(1)
def gotoorders():
    """Navigate from the signed-in page to the 'My Orders' page.

    Clicks the 'account-name' element, then the 'My Orders' link, and
    finally waits (up to 10 seconds each time) until a 'My Orders' link is
    clickable again.
    """
    driver.find_element_by_class_name('account-name').click()

    waiter = WebDriverWait(driver, 10)
    my_orders = (By.LINK_TEXT, 'My Orders')
    waiter.until(EC.element_to_be_clickable(my_orders)).click()
    # The result of this second wait is not used; it only blocks until the
    # link is clickable once more (presumably on the freshly loaded page).
    waiter.until(EC.element_to_be_clickable(my_orders))
def scrape_orders_page():
    """Yield one dict per product shown on the currently loaded orders page.

    Every ``tbody`` inside the element with id 'buyer-ordertable' is treated
    as one order. For each 'product-sets' element of an order a dict with
    the keys 'time', 'id', 'image', 'store', 'status', 'title', 'price' and
    'quantity' is yielded. All values are strings taken from the page.

    Side effect: each product image is downloaded into the current working
    directory, named after the last path component of its URL, unless a
    file with that name already exists. A failed download is reported on
    stdout and does not stop the scrape.
    """
    # Imported locally: the file header only imports HTTPError.
    from urllib.error import URLError

    # Value <span> that directly follows the <span> containing a given label.
    labelled = ".//span[text()[contains(.,'{}')]]/following-sibling::span[1]"

    ordertable = driver.find_element_by_id('buyer-ordertable')
    for order in ordertable.find_elements_by_tag_name('tbody'):
        # NOTE(review): the title is looked up once per order, so every
        # product of a multi-product order gets the title of the first
        # 'product-title' element -- confirm whether that is intended.
        title = order.find_element_by_class_name('product-title').text
        status = order.find_element_by_class_name('order-status').text
        # Named order_time so the imported `time` module is not shadowed.
        order_time = order.find_element_by_xpath(labelled.format('Order time')).text
        orderid = order.find_element_by_xpath(labelled.format('Order ID')).text
        store = order.find_element_by_xpath(labelled.format('Store name')).text

        for product in order.find_elements_by_class_name('product-sets'):
            pamount = (product.find_element_by_class_name('product-amount')
                       .find_elements_by_tag_name('span'))
            # The first two spans hold the price and the quantity.
            price, quantity = (elem.text for elem in pamount[:2])

            src = product.find_element_by_xpath(
                ".//div[@class='product-left']/a/img").get_attribute('src')
            # Drop everything after the last underscore of the src URL
            # (presumably a thumbnail-size suffix -- TODO confirm).
            image = src.rsplit('_', 1)[0]
            imagefile = image.split('/')[-1]
            if not os.path.exists(imagefile):
                try:
                    request.urlretrieve(image, imagefile)
                except URLError:
                    # URLError is the parent of HTTPError, so unreachable
                    # hosts are tolerated just like HTTP error statuses.
                    print('Couldn\'t downloaded image:', image)

            yield {'time': order_time,
                   'id': orderid,
                   'image': image,
                   'store': store,
                   'status': status,
                   'title': title,
                   'price': price,
                   'quantity': quantity}
def orders():
    """Iterate over all order pages, starting from the first one.

    Opens the 'My Orders' page via gotoorders(), yields ``driver`` for that
    first page, then yields whatever next_orders_page() produces. The
    yielded value is always the shared ``driver``; callers read the page
    content from it.
    """
    gotoorders()
    yield driver
    for page in next_orders_page():
        yield page
def next_orders_page():
    """Advance through the order pages, yielding ``driver`` after each step.

    Repeatedly clicks the 'Next' link, waits up to 5 seconds for the element
    with id 'buyer-ordertable' to be present and yields ``driver``. The
    generator finishes when no 'Next' link can be found.

    Raises:
        selenium.common.exceptions.TimeoutException: if the order table is
            not present within 5 seconds after clicking 'Next'.
    """
    # Imported locally: the file header does not import selenium exceptions.
    from selenium.common.exceptions import NoSuchElementException

    # A loop instead of self-recursion keeps the generator chain (and the
    # call stack) flat no matter how many pages there are.
    while True:
        try:
            next_link = driver.find_element_by_link_text('Next')
        except NoSuchElementException:
            # No 'Next' link: treat it as the last page and stop cleanly
            # rather than letting the lookup error escape to the caller.
            return
        next_link.click()
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, 'buyer-ordertable')))
        yield driver
# Script body: sign in, walk through every order page and write one CSV row
# per scraped product to list.csv in the current working directory.
login()
from itertools import islice
try:
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for files handed to a writer.
    with open('list.csv', 'w', newline='') as f:
        fw = csv.DictWriter(f, ATTRS)
        fw.writeheader()
        # islice(..., None) imposes no limit; replace None with a number to
        # stop after that many pages.
        for _ in islice(orders(), None):
            for row in scrape_orders_page():
                fw.writerow(row)
    # driver.save_screenshot('screen.png')
finally:
    # Close the browser even when scraping fails half-way.
    driver.quit()