{ "cells": [
  { "cell_type": "markdown", "metadata": {}, "source": [
    "# Selenium login and BeautifulSoup scraping\n",
    "\n",
    "1. Log in to kino-code.work with Selenium and click the download button on the members-only page.\n",
    "2. Parse HTML with BeautifulSoup, collect the links of a page into a DataFrame and save the filtered result as CSV."
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "%pip install selenium"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "%pip install beautifulsoup4"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "import datetime\n",
    "import os\n",
    "import time\n",
    "import urllib.request as req\n",
    "\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Log in with Selenium"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Login credentials. Set the KINO_USER / KINO_PASS environment variables to use\n",
    "# real ones without writing them into the notebook; the fallbacks are the\n",
    "# placeholder values this notebook has always used.\n",
    "USER = os.environ.get(\"KINO_USER\", \"test_user\")\n",
    "PASS = os.environ.get(\"KINO_PASS\", \"test_pw\")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Launch Google Chrome.\n",
    "# No driver path is passed: a machine-specific absolute path is not portable and\n",
    "# the executable_path argument is gone from current Selenium releases.\n",
    "# chromedriver therefore has to be discoverable, e.g. by being on PATH.\n",
    "browser = webdriver.Chrome()\n",
    "browser.implicitly_wait(3)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Open the login page\n",
    "url_login = \"https://kino-code.work/membership-login/\"\n",
    "browser.get(url_login)\n",
    "time.sleep(3)\n",
    "print(\"ログインページにアクセスしました\")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Fill in the user name and password fields.\n",
    "# find_element(By.<strategy>, value) replaces the removed find_element_by_* helpers.\n",
    "element = browser.find_element(By.ID, 'swpm_user_name')\n",
    "element.clear()\n",
    "element.send_keys(USER)\n",
    "element = browser.find_element(By.ID, 'swpm_password')\n",
    "element.clear()\n",
    "element.send_keys(PASS)\n",
    "print(\"フォームを送信\")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Click the login button to submit the form\n",
    "browser_from = browser.find_element(By.NAME, 'swpm-login')\n",
    "time.sleep(3)\n",
    "browser_from.click()\n",
    "print(\"情報を入力してログインボタンを押しました\")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Open the members-only page\n",
    "url=\"https://kino-code.work/member-only/\"\n",
    "time.sleep(3)\n",
    "browser.get(url)\n",
    "print(url,\":アクセス完了\")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Click the download button\n",
    "frm = browser.find_element(By.XPATH, '/html/body/div/div[3]/div/main/article/div/p[2]/button')\n",
    "time.sleep(1)\n",
    "frm.click()\n",
    "print('ダウンロードボタンをクリック')"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "## Parse HTML with BeautifulSoup"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "html=\"\"\"\n", "\n", " \n", " \n", " キノコード\n", " \n", " \n", " \n", "

こんにちは

\n", " \n", "\n", "\"\"\"" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "html"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Parse the sample HTML string with the built-in html.parser backend\n",
    "parse_html = BeautifulSoup(html,'html.parser')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "print(parse_html)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "print(parse_html.prettify())"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "url = \"https://kino-code.work/python-scraping/\"\n",
    "# Fetch and parse in the same cell: the response can only be read once, so a\n",
    "# separate parse cell would produce an empty document when re-run on its own.\n",
    "# The with-block also closes the connection as soon as parsing has finished.\n",
    "# From here on parse_html refers to the fetched page, not the sample string.\n",
    "with req.urlopen(url) as response:\n",
    "    parse_html = BeautifulSoup(response,'html.parser')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "parse_html"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "print(parse_html.title)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "print(parse_html.title.string)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "print(parse_html.find_all('a'))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Every <a> tag of the fetched page\n",
    "title_lists=parse_html.find_all('a')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "title_lists[1:10]"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "title_lists"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "title_lists[10].string"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# .get() returns None when the anchor has no href, where attrs['href'] would raise KeyError\n",
    "title_lists[10].get('href')"
  ] },
  { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [], "source": [
    "# Collect the text and the target of every anchor in title_lists.\n",
    "title_list=[]\n",
    "url_list=[]\n",
    "\n",
    "for link in title_lists:\n",
    "    title_list.append(link.string)\n",
    "    # .get() yields None for anchors without an href instead of raising KeyError;\n",
    "    # rows with a missing title or URL are removed by dropna() further down.\n",
    "    url_list.append(link.get('href'))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "title_list"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "url_list"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# One row per anchor: its text and its link target\n",
    "df_title_url = pd.DataFrame({'Title':title_list, 'URL':url_list})"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "df_title_url"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Drop rows where either the title or the URL is missing\n",
    "df_notnull = df_title_url.dropna(how='any')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "df_notnull"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Boolean mask: True where the title contains the course name\n",
    "df_notnull['Title'].str.contains('Python超入門コース')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "df_notnull[df_notnull['Title'].str.contains('Python超入門コース')]"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "df_contain_python = df_notnull[df_notnull['Title'].str.contains('Python超入門コース')]"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "df_contain_python"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Write the filtered links to output.csv in the current working directory\n",
    "df_contain_python.to_csv('output.csv')"
  ] }
 ],
 "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, 
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }