Fix text_markdown and text_html (python-telegram-bot#623)

jsmnbom · jh0ker · commit 2e89e2126170 · 2017-05-21T14:00:07.000+02:00
* Fix text_markdown and text_html

* Add a few narrow-build checks that were missed

* Add tests for emoji-first strings and emojis in URLs
diff --git a/telegram/message.py b/telegram/message.py
@@ -628,6 +628,9 @@ def text_html(self):
         """
         entities = self.parse_entities()
         message_text = self.text
+        if not sys.maxunicode == 0xffff:
+            message_text = message_text.encode('utf-16-le')
+
         markdown_text = ''
         last_offset = 0
 
@@ -647,10 +650,18 @@ def text_html(self):
             else:
                 insert = text
 
-            markdown_text += escape_html(message_text[last_offset:entity.offset]) + insert
+            if sys.maxunicode == 0xffff:
+                markdown_text += escape_html(message_text[last_offset:entity.offset]) + insert
+            else:
+                markdown_text += escape_html(message_text[last_offset * 2:entity.offset * 2]
+                                             .decode('utf-16-le')) + insert
+
             last_offset = entity.offset + entity.length
 
-        markdown_text += escape_html(message_text[last_offset:])
+        if sys.maxunicode == 0xffff:
+            markdown_text += escape_html(message_text[last_offset:])
+        else:
+            markdown_text += escape_html(message_text[last_offset * 2:].decode('utf-16-le'))
         return markdown_text
 
     @property
@@ -667,6 +678,9 @@ def text_markdown(self):
         """
         entities = self.parse_entities()
         message_text = self.text
+        if not sys.maxunicode == 0xffff:
+            message_text = message_text.encode('utf-16-le')
+
         markdown_text = ''
         last_offset = 0
 
@@ -685,9 +699,16 @@ def text_markdown(self):
                 insert = '```' + text + '```'
             else:
                 insert = text
+            if sys.maxunicode == 0xffff:
+                markdown_text += escape_markdown(message_text[last_offset:entity.offset]) + insert
+            else:
+                markdown_text += escape_markdown(message_text[last_offset * 2:entity.offset * 2]
+                                                 .decode('utf-16-le')) + insert
 
-            markdown_text += escape_markdown(message_text[last_offset:entity.offset]) + insert
             last_offset = entity.offset + entity.length
 
-        markdown_text += escape_markdown(message_text[last_offset:])
+        if sys.maxunicode == 0xffff:
+            markdown_text += escape_markdown(message_text[last_offset:])
+        else:
+            markdown_text += escape_markdown(message_text[last_offset * 2:].decode('utf-16-le'))
         return markdown_text
diff --git a/tests/test_message.py b/tests/test_message.py
@@ -98,16 +98,46 @@ def test_parse_entities(self):
                              {entity: 'http://google.com',
                               entity_2: 'h'})
 
-    def test_text_html(self):
+    def test_text_html_simple(self):
         test_html_string = 'Test for &lt;<b>bold</b>, <i>ita_lic</i>, <code>code</code>, <a href="http://github.com/">links</a> and <pre>pre</pre>.'
         text_html = self.test_message.text_html
         self.assertEquals(test_html_string, text_html)
 
-    def test_text_markdown(self):
+    def test_text_markdown_simple(self):
         test_md_string = 'Test for <*bold*, _ita\_lic_, `code`, [links](http://github.com/) and ```pre```.'
         text_markdown = self.test_message.text_markdown
         self.assertEquals(test_md_string, text_markdown)
 
+    def test_text_html_emoji(self):
+        text = (b'\\U0001f469\\u200d\\U0001f469\\u200d ABC').decode('unicode-escape')
+        expected = (b'\\U0001f469\\u200d\\U0001f469\\u200d <b>ABC</b>').decode('unicode-escape')
+        bold_entity = telegram.MessageEntity(type=telegram.MessageEntity.BOLD, offset=7, length=3)
+        message = telegram.Message(
+            message_id=1, from_user=None, date=None, chat=None, text=text, entities=[bold_entity])
+        self.assertEquals(expected, message.text_html)
+
+    def test_text_markdown_emoji(self):
+        text = (b'\\U0001f469\\u200d\\U0001f469\\u200d ABC').decode('unicode-escape')
+        expected = (b'\\U0001f469\\u200d\\U0001f469\\u200d *ABC*').decode('unicode-escape')
+        bold_entity = telegram.MessageEntity(type=telegram.MessageEntity.BOLD, offset=7, length=3)
+        message = telegram.Message(
+            message_id=1, from_user=None, date=None, chat=None, text=text, entities=[bold_entity])
+        self.assertEquals(expected, message.text_markdown)
+
+    def test_parse_entities_url_emoji(self):
+        url = b'http://github.com/?unicode=\\u2713\\U0001f469'.decode('unicode-escape')
+        text = 'some url'
+        link_entity = telegram.MessageEntity(type=telegram.MessageEntity.URL, offset=0, length=8, url=url)
+        message = telegram.Message(
+            message_id=1,
+            from_user=None,
+            date=None,
+            chat=None,
+            text=text,
+            entities=[link_entity])
+        self.assertDictEqual(message.parse_entities(), {link_entity: text})
+        self.assertEqual(next(iter(message.parse_entities())).url, url)
+
     @flaky(3, 1)
     def test_reply_text(self):
         """Test for Message.reply_text"""