1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
|
# https://github.com/ProgVal/Limnoria/pull/1371
# https://github.com/ProgVal/Limnoria/issues/1362
# https://github.com/ProgVal/Limnoria/issues/1359
From 7cf91ad703ab324e38c37fa2976626505f1d569a Mon Sep 17 00:00:00 2001
From: Rodrigo <rodrigo.freebsd@minasambiente.com.br>
Date: Fri, 27 Sep 2019 01:48:36 -0300
Subject: [PATCH 1/3] make clear in logs about charade dep|make behave the same
way in py3 or py2 without errors in any case (e.g. without charade installed)
---
plugins/Web/plugin.py | 44 +++++++++++++++++--------------------------
1 file changed, 17 insertions(+), 27 deletions(-)
diff --git plugins/Web/plugin.py plugins/Web/plugin.py
index 1ac362714..0bdb68a4a 100644
--- plugins/Web/plugin.py
+++ plugins/Web/plugin.py
@@ -150,37 +150,27 @@ def getTitle(self, irc, url, raiseErrors):
size = conf.supybot.protocols.http.peekSize()
timeout = self.registryValue('timeout')
(target, text) = utils.web.getUrlTargetAndContent(url, size=size,
- timeout=timeout)
- try:
- text = text.decode(utils.web.getEncoding(text) or 'utf8',
- 'replace')
- except UnicodeDecodeError:
- pass
- if minisix.PY3 and isinstance(text, bytes):
- if raiseErrors:
- irc.error(_('Could not guess the page\'s encoding. (Try '
- 'installing python-charade.)'), Raise=True)
- else:
- return None
+ timeout=timeout)
+ encoding = utils.web.getEncoding(text)
+ if encoding is None: # Condition if charade not installed
+ self.log.info('Web plugin TitleSnarfer: Could not guess the page\'s'
+ ' encoding. (Try installing python-charade.)')
+ encoding = 'utf-8' # Assume UTF-8 and replace unknown chars to the UTF-8 codec for U+FFFD in the next hop
try:
+ text = text.decode(utils.web.getEncoding(text) or 'utf-8', 'replace')
parser = Title()
parser.feed(text)
- except UnicodeDecodeError:
- # Workaround for Python 2
- # https://github.com/ProgVal/Limnoria/issues/1359
- parser = Title()
- parser.feed(text.encode('utf8'))
- parser.close()
- title = utils.str.normalizeWhitespace(''.join(parser.data).strip())
- if title:
- return (target, title)
- elif raiseErrors:
- if len(text) < size:
- irc.error(_('That URL appears to have no HTML title.'),
- Raise=True)
+ parser.close()
+ title = utils.str.normalizeWhitespace(''.join(parser.data).strip())
+ if title:
+ return (target, title)
else:
- irc.error(format(_('That URL appears to have no HTML title '
- 'within the first %S.'), size), Raise=True)
+ if len(text) < size:
+ self.log.info('Web plugin TitleSnarfer: '
+ 'That URL appears to have no HTML title.')
+ except raiseErrors: # Can use raiseErrors here ?
+ irc.error(_('Web plugin TitleSnarfer encoding errors'),
+ Raise=True)
@fetch_sandbox
def titleSnarfer(self, irc, msg, match):
From 16247c5caf59f307b3d6910e31832fe30d0d583a Mon Sep 17 00:00:00 2001
From: Rodrigo <rodrigo.freebsd@minasambiente.com.br>
Date: Fri, 27 Sep 2019 20:12:00 -0300
Subject: [PATCH 2/3] more python2 compatibility
---
plugins/Web/plugin.py | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git plugins/Web/plugin.py plugins/Web/plugin.py
index 0bdb68a4a..85c129de3 100644
--- plugins/Web/plugin.py
+++ plugins/Web/plugin.py
@@ -150,16 +150,20 @@ def getTitle(self, irc, url, raiseErrors):
size = conf.supybot.protocols.http.peekSize()
timeout = self.registryValue('timeout')
(target, text) = utils.web.getUrlTargetAndContent(url, size=size,
- timeout=timeout)
+ timeout=timeout)
encoding = utils.web.getEncoding(text)
- if encoding is None: # Condition if charade not installed
+ if encoding is None: # Condition if charade not installed
self.log.info('Web plugin TitleSnarfer: Could not guess the page\'s'
' encoding. (Try installing python-charade.)')
- encoding = 'utf-8' # Assume UTF-8 and replace unknown chars to the UTF-8 codec for U+FFFD in the next hop
+ encoding = 'utf-8' # Assume UTF-8 and replace unknown chars to the UTF-8 codec for U+FFFD in the next hop
try:
- text = text.decode(utils.web.getEncoding(text) or 'utf-8', 'replace')
+ text = text.decode(utils.web.getEncoding(text) or 'utf-8','replace')
parser = Title()
- parser.feed(text)
+ try:
+ parser.feed(text)
+ except:
+ parser = Title()
+ parser.feed(bytes(text)) # Explicitly pack to bytes in encoding errors for (more) python2 compatibility
parser.close()
title = utils.str.normalizeWhitespace(''.join(parser.data).strip())
if title:
@@ -168,9 +172,9 @@ def getTitle(self, irc, url, raiseErrors):
if len(text) < size:
self.log.info('Web plugin TitleSnarfer: '
'That URL appears to have no HTML title.')
- except raiseErrors: # Can use raiseErrors here ?
+ except:
irc.error(_('Web plugin TitleSnarfer encoding errors'),
- Raise=True)
+ Raise=True)
@fetch_sandbox
def titleSnarfer(self, irc, msg, match):
From 5cac4a3cbda26186a330709901da2633914de415 Mon Sep 17 00:00:00 2001
From: Rodrigo <rodrigo.freebsd@minasambiente.com.br>
Date: Sat, 28 Sep 2019 23:40:44 -0300
Subject: [PATCH 3/3] Include '(target, text) =
utils.web.getUrlTargetAndContent(url, size=size,timeout=timeout)' in
try/catch block to avoid thread blocking | send http errors to log
---
plugins/Web/plugin.py | 23 +++++++++++------------
1 file changed, 11 insertions(+), 12 deletions(-)
diff --git plugins/Web/plugin.py plugins/Web/plugin.py
index 85c129de3..e67944f29 100644
--- plugins/Web/plugin.py
+++ plugins/Web/plugin.py
@@ -149,14 +149,13 @@ def noIgnore(self, irc, msg):
def getTitle(self, irc, url, raiseErrors):
size = conf.supybot.protocols.http.peekSize()
timeout = self.registryValue('timeout')
- (target, text) = utils.web.getUrlTargetAndContent(url, size=size,
- timeout=timeout)
- encoding = utils.web.getEncoding(text)
- if encoding is None: # Condition if charade not installed
- self.log.info('Web plugin TitleSnarfer: Could not guess the page\'s'
- ' encoding. (Try installing python-charade.)')
- encoding = 'utf-8' # Assume UTF-8 and replace unknown chars to the UTF-8 codec for U+FFFD in the next hop
try:
+ (target, text) = utils.web.getUrlTargetAndContent(url, size=size,timeout=timeout)
+ encoding = utils.web.getEncoding(text)
+ if encoding is None: # Condition if charade not installed
+ self.log.info('Web plugin TitleSnarfer: Could not guess the page\'s'
+ ' encoding. (Try installing python-charade.)')
+ encoding = 'utf-8' # Assume UTF-8 and replace unknown chars to the UTF-8 codec for U+FFFD in the next hop
text = text.decode(utils.web.getEncoding(text) or 'utf-8','replace')
parser = Title()
try:
@@ -170,11 +169,11 @@ def getTitle(self, irc, url, raiseErrors):
return (target, title)
else:
if len(text) < size:
- self.log.info('Web plugin TitleSnarfer: '
- 'That URL appears to have no HTML title.')
- except:
- irc.error(_('Web plugin TitleSnarfer encoding errors'),
- Raise=True)
+ self.log.info('Web plugin TitleSnarfer: <' + url + '> appears to have no HTML title.')
+ else:
+ self.log.info('Web plugin TitleSnarfer: Could not retrieve title of <' + url + '>')
+ except Exception as e:
+ self.log.info('Web plugin TitleSnarfer: <' + str(e) + '> while trying to process <' + url +'>')
@fetch_sandbox
def titleSnarfer(self, irc, msg, match):
|