spider_get_and_check_ssq.py

# -*- coding: utf-8 -*-
import datetime
import os
import sqlite3

import httpx
from selenium import webdriver


def get_cookies(url):
    """Open the target page in headless Chrome and return its cookies, or None."""
    chrome_options = webdriver.ChromeOptions()
    args = ['--headless', '--no-sandbox', '--disable-gpu', '--disable-dev-shm-usage']
    for arg in args:
        chrome_options.add_argument(arg)
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        result_cookie = driver.get_cookies()
    finally:
        driver.quit()
    return result_cookie if result_cookie else None


def req(url, cookies):
    """Fetch the draw-notice JSON from cwl.gov.cn and hand the result list to data_handle."""
    with httpx.Client() as client:
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Cookie": cookies,
            "Host": "www.cwl.gov.cn",
            "User-Agent": "Mozilla/5.0"
        }
        res = client.get(url, headers=headers, follow_redirects=True)
        if res.status_code != 200:
            print(res.status_code)
            # get_path is a project-local helper (not shown in this file) that returns the logs directory.
            log_file_path = os.path.join(get_path.get_logs_path(), str(datetime.date.today()) + '.log')
            with open(log_file_path, 'a') as f:
                f.write("\nspider_ssq: %s" % res.status_code)
            return
        res_json = res.json()
        data_handle(res_json['result'])


def data_handle(source_data):
    """Rebuild the ssq table and insert one row per draw."""
    # utils_get_path is a project-local helper (not shown in this file) that returns the database directory.
    ssq_db_path = os.path.join(utils_get_path.get_db_path(), 'ssq.db')
    conn = sqlite3.connect(ssq_db_path)
    c = conn.cursor()
    # Drop the old table so a fresh crawl does not collide with existing primary keys.
    c.execute('drop table if exists ssq;')
    c.execute(
        'create table if not exists `ssq` (id INT PRIMARY KEY NOT NULL, `code` varchar(10),`red1` varchar(2),`red2` varchar(2),`red3` varchar(2),`red4` varchar(2),`red5` varchar(2),`red6` varchar(2),`blue` varchar(2),`date` varchar(12),`sales` varchar(15),`poolmoney` varchar(15),`content` varchar(255));')
    insert_sql = ('INSERT INTO ssq (id, code, red1, red2, red3, red4, red5, red6, blue, date, sales, poolmoney, content) '
                  'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)')
    row_id = 1
    for data in source_data:
        # "red" arrives as a single comma-separated string of the six red-ball numbers.
        reds = data.get('red', '').split(',')
        c.execute(insert_sql, (
            row_id,
            data.get('code'),
            reds[0], reds[1], reds[2], reds[3], reds[4], reds[5],
            data.get('blue'),
            data.get('date'),
            data.get('sales'),
            data.get('poolmoney'),
            data.get('content'),
        ))
        row_id += 1
    conn.commit()
    conn.close()


if __name__ == "__main__":
    url = 'http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd=&pageNo=1&pageSize=10&week=&systemType=PC'
    # result_cookie = get_cookies(url)
    # cookies = '{}={}'.format(result_cookie[0].get('name'), result_cookie[0].get('value'))
    # print(cookies)
    # Cookie used for testing
    cookies = "HMF_CI=1b2fd73192f2054a429b2bfa4f58c3ff98119441420133cc8a04ca9c95aa2266eaec5bb7cf1d37df5f9864b8629ba407bacc9c58cadf26e2d726582df3870b0969"
    req(url, cookies)
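
For a quick sanity check after a run, the populated database can be read back with the standard sqlite3 module. A minimal sketch, assuming ssq.db sits in the current directory (in the project it lives under whatever utils_get_path.get_db_path() returns):

    import sqlite3

    # Illustrative path; point it at the directory the spider actually writes to.
    ssq_db_path = 'ssq.db'

    conn = sqlite3.connect(ssq_db_path)
    c = conn.cursor()
    # Print the first few rows: issue code, the six red balls, the blue ball and the draw date.
    for row in c.execute('SELECT code, red1, red2, red3, red4, red5, red6, blue, date FROM ssq ORDER BY id LIMIT 5'):
        print(row)
    conn.close()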