from urllib.request import urlopen
url="https://www.aozora.gr.jp/cards/000148/files/789_14547.html"
# Download the page and decode it as Shift_JIS text.  The with-block closes
# the HTTP response as soon as the body has been read.
with urlopen(url) as response:
    text=response.read().decode('shift-jis')

<ruby> 
  漢 <rp>(</rp><rt>かん</rt><rp>)</rp> 字 <rp>(</rp><rt>じ</rt><rp>)</rp>
</ruby>


from bs4 import BeautifulSoup
# Parse the page directly from the HTTP response.  features may also be
# "lxml", which runs a little faster.  The with-block closes the response
# once the parser has consumed it.
with urlopen(url) as response:
    soup=BeautifulSoup(response, features="html.parser")


%%timeit 
soup=BeautifulSoup(text, features="html.parser") # features may also be "lxml".

377 ms ± 2.77 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


%%timeit 
soup=BeautifulSoup(text, features="lxml") # lxml parser; in the timings recorded in this notebook it is somewhat faster than html.parser.

<magic-timeit>:1: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.

201 ms ± 3.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


%%timeit 
soup=BeautifulSoup(text, features="html5lib") # html5lib parser; the slowest of the three parsers timed in this notebook.

<magic-timeit>:1: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.

1.47 s ± 9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Pick out the body of the work, the author credit(s) and the title by their
# CSS class, then show the title together with the list of author names.
main_text = soup.find("div", class_="main_text")
authors = soup.find_all(class_="author")
title = soup.find(class_="title")
print(title.text, [entry.text for entry in authors])

吾輩は猫である ['夏目漱石']


# Encoding that BeautifulSoup settled on when it decoded the downloaded bytes.
soup.original_encoding

'shift_jis'


# Attribute access (soup.ruby) is shorthand for soup.find('ruby'): both give
# the first <ruby> tag in the document.
soup.find('ruby') == soup.ruby

True


# Show the markup of the first <ruby> element in the document.
print(soup.ruby)

<ruby><rb>吾輩</rb><rp>（</rp><rt>わがはい</rt><rp>）</rp></ruby>


# List every <rt> (reading) element inside the first <ruby> element.
soup.ruby.find_all('rt')

[<rt>わがはい</rt>]


# Join the text of every <rt> inside the first <ruby>, separated by spaces.
" ".join(reading.text for reading in soup.ruby.find_all("rt"))

'わがはい'


# Expand ruby annotations in place: each <ruby> element is replaced by the
# text of its <rt> children (the reading), joined with spaces.  The first
# remaining <ruby> is looked up once per pass rather than three times.
ruby = main_text.ruby
while ruby is not None:
    rt = " ".join(tag.text for tag in ruby.find_all('rt'))
    ruby.replace_with(soup.new_string(rt))
    ruby = main_text.ruby

# Same expansion written as a single pass over the <ruby> elements collected
# up front: each one is swapped for the space-joined text of its <rt> tags.
for ruby_tag in main_text.find_all("ruby"):
    reading = " ".join(tag.text for tag in ruby_tag.find_all("rt"))
    ruby_tag.replace_with(soup.new_string(reading))


from macspeechX import SpeakString
# Quick check of the speech output: speak a short Japanese greeting.  No voice
# argument is given here.
SpeakString("こんにちは。'mac-speech-X' にようこそ。")

<macspeechX.macspeechX_AVF.SpeechChannel at 0x10ad62750>


# Break the ruby-expanded text into whitespace-separated pieces and read only
# the first two aloud as a trial run.
lines = main_text.text.split()
for piece in lines[:2]:
    SpeakString(piece)


#!python3
# -*- coding:utf-8 -*-
"""
Read Aozora Bunko works that carry ruby annotations aloud, using macspeechX.
"""
import macspeechX
from macspeechX import SpeakString
from bs4 import BeautifulSoup
from urllib.request import urlopen
from io import BytesIO

# Sample Aozora Bunko XHTML pages that can be passed to test().
# NOTE(review): "farady" looks like a misspelling of "Faraday"; the name is
# left unchanged so existing references keep working.
# presumably a work about Faraday -- confirm against the page.
farady_no_den_url="https://www.aozora.gr.jp/cards/001234/files/46340_24939.html"
# Natsume Soseki, "Wagahai wa Neko de Aru" (the page parsed earlier in this notebook).
wagahai_url="https://www.aozora.gr.jp/cards/000148/files/789_14547.html"
# Default page for test(); presumably "Ginga Tetsudo no Yoru" -- confirm.
ginga_tetsudo_url="https://www.aozora.gr.jp/cards/000081/files/456_15050.html"

def test(url=ginga_tetsudo_url, voice=0):
    """Read an Aozora Bunko work aloud, speaking ruby readings in place of kanji.

    Downloads the page at *url*, replaces every <ruby> element inside the
    ``main_text`` div with the text of its <rt> children, and then speaks
    the title, each author and the body text through macspeechX.

    Args:
        url: Address of the Aozora Bunko XHTML page to read.
        voice: Value handed to ``macspeechX.Voice``.  Calls in this file pass
            either an index (``0``) or a voice name string.

    Raises:
        ValueError: If the page contains no ``<div class="main_text">``.
    """
    v = macspeechX.Voice(voice)
    # Close the HTTP response as soon as the parser has consumed it.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features="html.parser")
    main_text = soup.find("div", attrs={"class": "main_text"})
    if main_text is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "no <div class=\"main_text\"> found in {!r}".format(url)
        )
    authors = soup.find_all(attrs={"class": "author"})
    title = soup.find(attrs={"class": "title"})
    # Expand the ruby tags and embed their readings in the text.  Looking up
    # the first remaining <ruby> once per pass also copes with nested ruby,
    # because an inner tag disappears together with its replaced parent.
    ruby = main_text.ruby
    while ruby is not None:
        reading = " ".join(rt.text for rt in ruby.find_all('rt'))
        ruby.replace_with(soup.new_string(reading))
        ruby = main_text.ruby
    lines = main_text.text.split()
    # A page without a title element is still readable; just skip the title.
    if title is not None:
        SpeakString(title.text, v)
    for author in authors:
        SpeakString(author.text, v)
    for t in lines:
        SpeakString(t, v)


# Read the page at this URL aloud, selecting the voice by the name "Otoya(Enhanced)".
test("https://www.aozora.gr.jp/cards/000148/files/2672_6499.html",voice="Otoya(Enhanced)")

Python入門講座　[番外編]：　AozoraReader¶

概要¶

ファイルのダウンロード¶

ルビの展開¶

BeautifulSoup の利用法¶

補足説明¶

テキストを読み上げる¶

macspeechX¶

まとめ¶

Python入門講座 [番外編]： AozoraReader¶

概要¶

ファイルのダウンロード¶

ルビの展開¶

BeautifulSoup の利用法¶

補足説明¶

テキストを読み上げる¶

macspeechX¶

まとめ¶

Python入門講座　[番外編]：　AozoraReader¶