crawler/episode.py at master · darkblank/crawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Episode = namedtuple('Episode', ['no', 'img_url', 'title', 'rating', 'created_date'])
import os
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup


class Episode:
    """
    One episode of a webtoon, mirrored to the local disk.

    Plays the role the former ``Episode`` namedtuple had (plain data
    attributes), and additionally downloads its own assets.

    NOTE: constructing an instance has side effects. ``__init__`` calls
    ``save_thumbnail()`` and ``save_contents()``, which perform HTTP requests
    and write files below the relative ``webtoon/`` directory.

    Directory layout (relative to the current working directory):
        webtoon/{webtoon.title}/{webtoon.title_id}_thumbnail/{no}.jpg
        webtoon/{webtoon.title}/{webtoon.title_id}_images/{no}/{index}.jpg
        webtoon/{webtoon.title}/{webtoon.title_id}_main/{no}.html
    """

    def __init__(self, webtoon, no, url_thumbnail, title, rating, created_date):
        """
        Store the episode data, derive the output directories and download.

        :param webtoon: object exposing ``title`` and ``title_id`` attributes
        :param no: episode number; used in file names and as the ``no`` query parameter
        :param url_thumbnail: URL of the episode's thumbnail image
        :param title: episode title
        :param rating: stored as given
        :param created_date: stored as given
        """
        self.webtoon = webtoon
        self.no = no
        self.url_thumbnail = url_thumbnail
        self.title = title
        self.rating = rating
        self.created_date = created_date

        self.thumbnail_dir = f'webtoon/{self.webtoon.title}/{self.webtoon.title_id}_thumbnail'
        self.image_dir = 'webtoon/%s/%s_images/%s' % (self.webtoon.title, self.webtoon.title_id, self.no)
        self.episode_dir = f'webtoon/{self.webtoon.title}/{self.webtoon.title_id}_main'
        # ex) webtoon/{title}/669233_images/1/1.jpg
        # ex) webtoon/{title}/669233_images/1/2.jpg
        # ex) webtoon/{title}/669233_images/1/3.jpg
        self.save_thumbnail()
        self.save_contents()

    @property
    def _thumbnail_path(self):
        """Path of this episode's thumbnail file: {thumbnail_dir}/{no}.jpg."""
        return f'{self.thumbnail_dir}/{self.no}.jpg'

    @property
    def has_thumbnail(self):
        """
        Check whether the thumbnail file {thumbnail_dir}/{no}.jpg exists.

        :return: True if the file exists, otherwise False
        """
        return os.path.exists(self._thumbnail_path)

    def save_thumbnail(self, force_update=False):
        """
        Download the image at ``url_thumbnail`` and save it as the thumbnail.

        Nothing is requested or written when the thumbnail already exists,
        unless ``force_update`` is set.

        :param force_update: when True, download again and overwrite the existing file
        :return: None
        """
        if self.has_thumbnail and not force_update:
            return
        os.makedirs(self.thumbnail_dir, exist_ok=True)
        response = requests.get(self.url_thumbnail)
        # Write unconditionally: an extra existence check at this point would
        # make force_update a no-op for files that are already present.
        with open(self._thumbnail_path, 'wb') as f:
            f.write(response.content)

    def save_contents(self, force_update=False):
        """
        Save the episode's images and then build its HTML page from them.

        :param force_update: forwarded to ``_save_images`` and ``_make_html``
        :return: None
        """
        self._save_images(force_update=force_update)
        self._make_html(force_update=force_update)

    def _save_images(self, force_update=False):
        """
        Download every image of this episode's detail page.

        Files are written as:
            webtoon/{webtoon.title}
                /{webtoon.title_id}_images
                    /{no}
                        /{1-based position on the page}.jpg

        The detail page is always requested; an individual image is requested
        only when its file is missing or ``force_update`` is set. HTTP status
        codes are not checked. Raises AttributeError when the page contains no
        element matching ``.wt_viewer``.

        :param force_update: when True, download images even if their files exist
        :return: None
        """
        os.makedirs(self.image_dir, exist_ok=True)

        # URL of the episode detail page (url_contents)
        params = {
            'titleId': self.webtoon.title_id,
            'no': self.no
        }
        url_contents = 'http://comic.naver.com/webtoon/detail.nhn?' \
                       + urlencode(params)
        response = requests.get(url_contents)
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        img_list = soup.select_one('.wt_viewer').find_all('img')
        url_img_list = [img['src'] for img in img_list]

        # Each image request is sent with the detail page as its Referer.
        headers = {
            'Referer': url_contents
        }
        for index, url in enumerate(url_img_list):
            image_path = f'{self.image_dir}/{index + 1}.jpg'
            if os.path.exists(image_path) and not force_update:
                continue
            response = requests.get(url, headers=headers)
            with open(image_path, 'wb') as f:
                f.write(response.content)

    @staticmethod
    def _image_sort_key(file_name):
        """
        Sort key that orders image files by their numeric name.

        Names whose stem parses as an integer come first in numeric order
        (so '2.jpg' precedes '10.jpg'); all other names follow alphabetically.
        """
        stem = os.path.splitext(file_name)[0]
        try:
            return (0, int(stem), file_name)
        except ValueError:
            return (1, 0, file_name)

    def _make_html(self, force_update=False):
        """
        Render {episode_dir}/{no}.html from the 'html/detail_html.html' template.

        '*title*' in the template is replaced by '<webtoon title> - <episode title>'
        and '*contents*' by one <img> tag per file found in ``image_dir``.
        Nothing is done when the HTML file already exists, unless
        ``force_update`` is set.

        :param force_update: when True, regenerate an existing HTML file
        :return: None
        """
        html_path = f'{self.episode_dir}/{self.no}.html'
        if os.path.exists(html_path) and not force_update:
            return

        os.makedirs(self.episode_dir, exist_ok=True)
        with open('html/detail_html.html', 'rt') as f:
            detail_html = f.read()
        detail_html = detail_html.replace(
            '*title*', '%s - %s' % (self.webtoon.title, self.title)
        )

        # os.listdir() returns names in arbitrary order; sort numerically so
        # the images appear in the order they were saved (1, 2, ..., 10, ...).
        file_names = sorted(os.listdir(self.image_dir), key=self._image_sort_key)
        # The HTML file sits three directories below the working directory,
        # hence the '../../../' prefix in front of the relative image path.
        img_list_html = ''.join(
            '<img src="../../../%s/%s">' % (self.image_dir, file_name)
            for file_name in file_names
        )

        detail_html = detail_html.replace('*contents*', img_list_html)
        with open(html_path, 'wt') as f:
            f.write(detail_html)

# if __name__ == '__main__':
#     el = pickle.load(open('db/697680.txt', 'rb'))
#     e = el[0]
#     e._save_images()